From ba7f5c312a55118f2dbd7db78a3a729f7c15732a Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 28 Sep 2020 14:01:47 +0100 Subject: [PATCH 01/89] Preflight checks --- flowmachine/flowmachine/core/preflight.py | 72 +++++++++++++++++++ flowmachine/flowmachine/core/query.py | 8 ++- .../query_schemas/base_exposed_query.py | 1 + flowmachine/flowmachine/core/spatial_unit.py | 11 +-- .../flowmachine/core/sqlalchemy_utils.py | 8 ++- flowmachine/flowmachine/core/table.py | 22 +++--- .../features/network/total_network_objects.py | 8 +-- .../features/utilities/event_table_subset.py | 5 ++ .../features/utilities/events_tables_union.py | 4 +- 9 files changed, 113 insertions(+), 26 deletions(-) create mode 100644 flowmachine/flowmachine/core/preflight.py diff --git a/flowmachine/flowmachine/core/preflight.py b/flowmachine/flowmachine/core/preflight.py new file mode 100644 index 0000000000..037d758c61 --- /dev/null +++ b/flowmachine/flowmachine/core/preflight.py @@ -0,0 +1,72 @@ +import inspect +from collections import defaultdict +from functools import wraps + +import typing + +from flowmachine.core.errors.flowmachine_errors import QueryErroredException + + +def pre_flight(method): + method.__hooks__ = getattr(method, "__hooks__", {}) + method.__hooks__["pre_flight"] = method + + @wraps(method) + def _impl(self): + return method(self) + + return _impl + + +def resolve_hooks(cls) -> typing.Dict[str, typing.List[typing.Callable]]: + """Add in the decorated processors + By doing this after constructing the class, we let standard inheritance + do all the hard work. + """ + mro = inspect.getmro(cls) + + hooks = defaultdict(list) + + for attr_name in dir(cls): + # Need to look up the actual descriptor, not whatever might be + # bound to the class. This needs to come from the __dict__ of the + # declaring class. + for parent in mro: + try: + attr = parent.__dict__[attr_name] + except KeyError: + continue + else: + break + else: + # In case we didn't find the attribute and didn't break above. + # We should never hit this - it's just here for completeness + # to exclude the possibility of attr being undefined. + continue + + try: + hook_config = attr.__hook__ + except AttributeError: + pass + else: + for key in hook_config.keys(): + # Use name here so we can get the bound method later, in + # case the processor was a descriptor or something. + hooks[key].append(attr_name) + + return hooks + + +class Preflight: + def preflight(self): + errors = [] + for dependency in [*self.dependencies, self]: + for hook in resolve_hooks(dependency)["preflight"]: + try: + hook() + except Exception as e: + errors.append(e) + if len(errors) > 0: + raise QueryErroredException( + f"Pre-flight failed for '{self.query_id}'. 
Errors: {errors}" + ) diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index a1959014b0..ff3cd89abb 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -29,7 +29,11 @@ get_redis, submit_to_executor, ) -from flowmachine.core.errors.flowmachine_errors import QueryResetFailedException +from flowmachine.core.errors.flowmachine_errors import ( + QueryResetFailedException, + QueryErroredException, +) +from flowmachine.core.preflight import resolve_hooks, Preflight from flowmachine.core.query_state import QueryStateMachine from abc import ABCMeta, abstractmethod @@ -55,7 +59,7 @@ MAX_POSTGRES_NAME_LENGTH = 63 -class Query(metaclass=ABCMeta): +class Query(Preflight, metaclass=ABCMeta): """ The core base class of the flowmachine module. This should handle all input and output methods for our sql queries, so that diff --git a/flowmachine/flowmachine/core/server/query_schemas/base_exposed_query.py b/flowmachine/flowmachine/core/server/query_schemas/base_exposed_query.py index 27117cfd3d..7ed280d5eb 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/base_exposed_query.py +++ b/flowmachine/flowmachine/core/server/query_schemas/base_exposed_query.py @@ -52,6 +52,7 @@ def store_async(self, store_dependencies=True): Query ID that can be used to check the query state. """ q = self._flowmachine_query_obj + q.preflight() q.store(store_dependencies=store_dependencies) query_id = q.query_id diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index a33a5c16a6..9b339736c3 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -12,7 +12,7 @@ from flowmachine.utils import get_name_and_alias from flowmachine.core.errors import InvalidSpatialUnitError -from flowmachine.core import Query, Table +from flowmachine.core import Query, Table, preflight from flowmachine.core.context import get_db from flowmachine.core.grid import Grid @@ -306,7 +306,7 @@ def __init__( if geom_table is None: # Creating a Table object here means that we don't have to handle # tables and Query objects differently in _make_query and get_geom_query - self.geom_table = Table(name=get_db().location_table) + self.geom_table = Table(name="infrastructure.cells") elif isinstance(geom_table, Query): self.geom_table = geom_table else: @@ -702,8 +702,6 @@ class VersionedCellSpatialUnit(LonLatSpatialUnit): """ def __init__(self) -> None: - if get_db().location_table != "infrastructure.cells": - raise InvalidSpatialUnitError("Versioned cell spatial unit is unavailable.") super().__init__( geom_table_column_names=["version"], @@ -711,6 +709,11 @@ def __init__(self) -> None: geom_table="infrastructure.cells", ) + @preflight + def check_cells_available(self): + if get_db().location_table != "infrastructure.cells": + raise InvalidSpatialUnitError("Versioned cell spatial unit is unavailable.") + @property def canonical_name(self) -> str: return "versioned-cell" diff --git a/flowmachine/flowmachine/core/sqlalchemy_utils.py b/flowmachine/flowmachine/core/sqlalchemy_utils.py index f5cf455d4c..6f0991945c 100644 --- a/flowmachine/flowmachine/core/sqlalchemy_utils.py +++ b/flowmachine/flowmachine/core/sqlalchemy_utils.py @@ -56,7 +56,7 @@ def get_sql_string(sqlalchemy_query): return sql -def get_string_representation(sqlalchemy_expr, engine=None): +def get_string_representation(sqlalchemy_expr): """ Return a string containing a SQL fragment which is compiled 
from the given sqlalchemy expression. @@ -66,7 +66,11 @@ def get_string_representation(sqlalchemy_expr, engine=None): String representation of the sqlalchemy expression. """ # assert isinstance(sqlalchemy_expr, ColumnElement) - return str(sqlalchemy_expr.compile(engine, compile_kwargs={"literal_binds": True})) + return str( + sqlalchemy_expr.compile( + dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True} + ) + ) def get_query_result_as_dataframe(query, *, engine): diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 89d8d0f697..6a88db3d5f 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -84,40 +84,42 @@ def __init__( if not self.is_stored: raise ValueError("{} is not a known table.".format(self.fqn)) + # Record provided columns to ensure that query_id differs with different columns + if isinstance(columns, str): # Wrap strings in a list + columns = [columns] + self.columns = columns + super().__init__() + + def preflight(self): # Get actual columns of this table from the database db_columns = list( zip( *get_db().fetch( f"""SELECT column_name from INFORMATION_SCHEMA.COLUMNS - WHERE table_name = '{self.name}' AND table_schema='{self.schema}'""" + WHERE table_name = '{self.name}' AND table_schema='{self.schema}'""" ) ) )[0] if ( - columns is None or columns == [] + self.columns is None or self.columns == [] ): # No columns specified, setting them from the database columns = db_columns else: self.parent_table = Table( schema=self.schema, name=self.name ) # Point to the full table - if isinstance(columns, str): # Wrap strings in a list - columns = [columns] + logger.debug( "Checking provided columns against db columns.", provided=columns, db_columns=db_columns, ) - if not set(columns).issubset(db_columns): + if not set(self.columns).issubset(db_columns): raise ValueError( "{} are not columns of {}".format( - set(columns).difference(db_columns), self.fqn + set(self.columns).difference(db_columns), self.fqn ) ) - - # Record provided columns to ensure that query_id differs with different columns - self.columns = columns - super().__init__() # Table is immediately in a 'finished executing' state q_state_machine = QueryStateMachine( get_redis(), self.query_id, get_db().conn_id diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index c40853f0e0..06bbf1f3ce 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -77,12 +77,8 @@ def __init__( subscriber_subset=None, subscriber_identifier="msisdn", ): - self.start = standardise_date( - get_db().min_date(table=table) if start is None else start - ) - self.stop = standardise_date( - get_db().max_date(table=table) if stop is None else stop - ) + self.start = standardise_date(start) + self.stop = standardise_date(stop) self.table = table if isinstance(self.table, str): diff --git a/flowmachine/flowmachine/features/utilities/event_table_subset.py b/flowmachine/flowmachine/features/utilities/event_table_subset.py index 5ab3f8d723..77f3b74c5a 100644 --- a/flowmachine/flowmachine/features/utilities/event_table_subset.py +++ b/flowmachine/flowmachine/features/utilities/event_table_subset.py @@ -150,8 +150,13 @@ def __init__( super().__init__() + def preflight(self): # This needs to happen after the parent classes init method has been # called as it relies upon the 
connection object existing + self.sqlalchemy_table = get_sqlalchemy_table_definition( + self.table_ORIG.fully_qualified_table_name, + engine=get_db().engine, + ) self._check_dates() @property diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index 8538a30e46..36bd07e26e 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -83,8 +83,8 @@ def column_names(self) -> List[str]: def _parse_tables(self, tables): if tables is None: - return [f"events.{t}" for t in get_db().subscriber_tables] - elif isinstance(tables, str) and len(tables) > 0: + return [f"events.{t}" for t in ("sms", "calls", "mds", "topups")] + elif isinstance(tables, str): return [tables] elif isinstance(tables, str): raise ValueError("Empty table name.") From 7978db51c750bb30d8d5bf54d6e37986b072af82 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 28 Sep 2020 14:05:08 +0100 Subject: [PATCH 02/89] Fixup --- flowmachine/flowmachine/core/spatial_unit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 9b339736c3..e648bcf70d 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -12,7 +12,8 @@ from flowmachine.utils import get_name_and_alias from flowmachine.core.errors import InvalidSpatialUnitError -from flowmachine.core import Query, Table, preflight +from flowmachine.core import Query, Table +from flowmachine.core.preflight import pre_flight from flowmachine.core.context import get_db from flowmachine.core.grid import Grid @@ -709,7 +710,7 @@ def __init__(self) -> None: geom_table="infrastructure.cells", ) - @preflight + @pre_flight def check_cells_available(self): if get_db().location_table != "infrastructure.cells": raise InvalidSpatialUnitError("Versioned cell spatial unit is unavailable.") From 00767065099481fa29bf9fec9be9b32e95152970 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 28 Sep 2020 16:33:26 +0100 Subject: [PATCH 03/89] Run preflight in get_query --- flowmachine/flowmachine/core/query.py | 1 + flowmachine/flowmachine/core/table.py | 1 + 2 files changed, 2 insertions(+) diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index ff3cd89abb..b3dd070228 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -263,6 +263,7 @@ def get_query(self): SQL query string. 
""" + self.preflight() try: table_name = self.fully_qualified_table_name schema, name = table_name.split(".") diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 6a88db3d5f..c9052a0385 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -146,6 +146,7 @@ def _make_query(self): return "SELECT {cols} FROM {fqn}".format(fqn=self.fqn, cols=cols) def get_query(self): + self.preflight() with get_db().engine.begin() as trans: trans.exec_driver_sql( f"UPDATE cache.cached SET last_accessed = NOW(), access_count = access_count + 1 WHERE query_id ='{self.query_id}'" From fd1d05c96ac4f0a8252c768367b654bdc0a8e508 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 28 Sep 2020 16:49:58 +0100 Subject: [PATCH 04/89] Set columns post-init where all are required --- flowmachine/flowmachine/core/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index c9052a0385..801ae9df25 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -103,7 +103,7 @@ def preflight(self): if ( self.columns is None or self.columns == [] ): # No columns specified, setting them from the database - columns = db_columns + self.columns = db_columns else: self.parent_table = Table( schema=self.schema, name=self.name From 1428e83bd86833430364a0196b7983f827fee96c Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 28 Sep 2020 17:04:55 +0100 Subject: [PATCH 05/89] Need the class for mro --- flowmachine/flowmachine/core/preflight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/preflight.py b/flowmachine/flowmachine/core/preflight.py index 037d758c61..569e7e5a8a 100644 --- a/flowmachine/flowmachine/core/preflight.py +++ b/flowmachine/flowmachine/core/preflight.py @@ -61,7 +61,7 @@ class Preflight: def preflight(self): errors = [] for dependency in [*self.dependencies, self]: - for hook in resolve_hooks(dependency)["preflight"]: + for hook in resolve_hooks(dependency.__class__)["preflight"]: try: hook() except Exception as e: From 370a92ce6ff2f1fa8eba7e101a7dda1ce3996182 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 29 Sep 2020 11:44:34 +0100 Subject: [PATCH 06/89] Fix some typos, and call hooks in a sensible order --- flowmachine/flowmachine/core/preflight.py | 32 ++++++++++++------- flowmachine/flowmachine/core/table.py | 21 ++++++++---- .../features/subscriber/contact_balance.py | 7 ++-- .../features/utilities/event_table_subset.py | 6 ++-- 4 files changed, 45 insertions(+), 21 deletions(-) diff --git a/flowmachine/flowmachine/core/preflight.py b/flowmachine/flowmachine/core/preflight.py index 569e7e5a8a..303fa64be5 100644 --- a/flowmachine/flowmachine/core/preflight.py +++ b/flowmachine/flowmachine/core/preflight.py @@ -4,18 +4,20 @@ import typing +import networkx as nx + +from flowmachine.core.dependency_graph import ( + calculate_dependency_graph, + get_dependency_links, + _assemble_dependency_graph, +) from flowmachine.core.errors.flowmachine_errors import QueryErroredException def pre_flight(method): method.__hooks__ = getattr(method, "__hooks__", {}) method.__hooks__["pre_flight"] = method - - @wraps(method) - def _impl(self): - return method(self) - - return _impl + return method def resolve_hooks(cls) -> typing.Dict[str, typing.List[typing.Callable]]: @@ -45,7 +47,7 @@ def resolve_hooks(cls) -> typing.Dict[str, 
typing.List[typing.Callable]]: continue try: - hook_config = attr.__hook__ + hook_config = attr.__hooks__ except AttributeError: pass else: @@ -60,12 +62,20 @@ def resolve_hooks(cls) -> typing.Dict[str, typing.List[typing.Callable]]: class Preflight: def preflight(self): errors = [] - for dependency in [*self.dependencies, self]: - for hook in resolve_hooks(dependency.__class__)["preflight"]: + dep_graph = _assemble_dependency_graph( + dependencies=get_dependency_links(self), + attrs_func=lambda x: dict(query=x), + ) + deps = [dep_graph.nodes[id]["query"] for id in nx.topological_sort(dep_graph)][ + ::-1 + ] + + for dependency in deps: + for hook in resolve_hooks(dependency.__class__)["pre_flight"]: try: - hook() + getattr(dependency, hook)() except Exception as e: - errors.append(e) + errors.append((dependency, e)) if len(errors) > 0: raise QueryErroredException( f"Pre-flight failed for '{self.query_id}'. Errors: {errors}" diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 801ae9df25..9262a5696c 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -12,6 +12,7 @@ from flowmachine.core.query_state import QueryStateMachine from .context import get_db, get_redis from .errors import NotConnectedError +from .preflight import pre_flight from .query import Query from .subset import subset_factory from .cache import write_cache_metadata @@ -81,16 +82,24 @@ def __init__( self.fqn = "{}.{}".format(schema, name) if schema else name if "." not in self.fqn: raise ValueError("{} is not a valid table.".format(self.fqn)) - if not self.is_stored: - raise ValueError("{} is not a known table.".format(self.fqn)) # Record provided columns to ensure that query_id differs with different columns if isinstance(columns, str): # Wrap strings in a list columns = [columns] self.columns = columns + if self.columns is not None and len(self.columns) > 0: + self.parent_table = Table( + schema=self.schema, name=self.name + ) # Point to the full table super().__init__() - def preflight(self): + @pre_flight + def check_exists(self): + if not self.is_stored: + raise ValueError("{} is not a known table.".format(self.fqn)) + + @pre_flight + def check_columns(self): # Get actual columns of this table from the database db_columns = list( zip( @@ -105,9 +114,6 @@ def preflight(self): ): # No columns specified, setting them from the database self.columns = db_columns else: - self.parent_table = Table( - schema=self.schema, name=self.name - ) # Point to the full table logger.debug( "Checking provided columns against db columns.", @@ -120,6 +126,9 @@ def preflight(self): set(self.columns).difference(db_columns), self.fqn ) ) + + @pre_flight + def ff_state_machine(self): # Table is immediately in a 'finished executing' state q_state_machine = QueryStateMachine( get_redis(), self.query_id, get_db().conn_id diff --git a/flowmachine/flowmachine/features/subscriber/contact_balance.py b/flowmachine/flowmachine/features/subscriber/contact_balance.py index 693b9c5d9a..5be273ded1 100644 --- a/flowmachine/flowmachine/features/subscriber/contact_balance.py +++ b/flowmachine/flowmachine/features/subscriber/contact_balance.py @@ -76,14 +76,17 @@ def __init__( exclude_self_calls=True, subscriber_subset=None, ): - self.tables = tables + self.tables = ( + ["events.calls", "events.sms"] + if tables.lower() == "all" or tables is None + else tables + ) self.start = standardise_date(start) self.stop = standardise_date(stop) self.hours = hours self.direction = 
Direction(direction) self.subscriber_identifier = subscriber_identifier self.exclude_self_calls = exclude_self_calls - self.tables = tables column_list = [ self.subscriber_identifier, diff --git a/flowmachine/flowmachine/features/utilities/event_table_subset.py b/flowmachine/flowmachine/features/utilities/event_table_subset.py index 77f3b74c5a..5aa8dca125 100644 --- a/flowmachine/flowmachine/features/utilities/event_table_subset.py +++ b/flowmachine/flowmachine/features/utilities/event_table_subset.py @@ -12,6 +12,7 @@ from ...core import Query, Table from ...core.context import get_db from ...core.errors import MissingDateError +from ...core.preflight import pre_flight from ...core.sqlalchemy_utils import ( get_sqlalchemy_table_definition, make_sqlalchemy_column_from_flowmachine_column_description, @@ -150,19 +151,20 @@ def __init__( super().__init__() - def preflight(self): + @pre_flight + def create_table_def(self): # This needs to happen after the parent classes init method has been # called as it relies upon the connection object existing self.sqlalchemy_table = get_sqlalchemy_table_definition( self.table_ORIG.fully_qualified_table_name, engine=get_db().engine, ) - self._check_dates() @property def column_names(self) -> List[str]: return [c.split(" AS ")[-1] for c in self.columns] + @pre_flight def _check_dates(self): # Handle the logic for dealing with missing dates. # If there are no dates present, then we raise an error From aa09330b46d3a7b0d8714cdc3e6136fcf18c3946 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 29 Sep 2020 15:11:19 +0100 Subject: [PATCH 07/89] Limit the default tables --- .../flowmachine/features/utilities/event_table_subset.py | 4 +++- .../flowmachine/features/utilities/events_tables_union.py | 6 ++++-- flowmachine/tests/conftest.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/flowmachine/flowmachine/features/utilities/event_table_subset.py b/flowmachine/flowmachine/features/utilities/event_table_subset.py index 5aa8dca125..4c5cb9ae91 100644 --- a/flowmachine/flowmachine/features/utilities/event_table_subset.py +++ b/flowmachine/flowmachine/features/utilities/event_table_subset.py @@ -153,6 +153,7 @@ def __init__( @pre_flight def create_table_def(self): + logger.debug("Creating sql alchemy table.") # This needs to happen after the parent classes init method has been # called as it relies upon the connection object existing self.sqlalchemy_table = get_sqlalchemy_table_definition( @@ -165,7 +166,8 @@ def column_names(self) -> List[str]: return [c.split(" AS ")[-1] for c in self.columns] @pre_flight - def _check_dates(self): + def check_dates(self): + logger.debug("Checking dates are valid.") # Handle the logic for dealing with missing dates. # If there are no dates present, then we raise an error # if some are present, but some are missing we raise a diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index 36bd07e26e..83f728cc33 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -65,7 +65,7 @@ def __init__( self.tables = self._parse_tables(tables) if "*" in columns and len(self.tables) != 1: raise ValueError( - "Must give named tables when combining multiple event type tables." + "Must give named columns when combining multiple event type tables." 
) self.date_subsets = self._make_table_list( hours=hours, @@ -83,7 +83,9 @@ def column_names(self) -> List[str]: def _parse_tables(self, tables): if tables is None: - return [f"events.{t}" for t in ("sms", "calls", "mds", "topups")] + return [ + f"events.{t}" for t in ("sms", "calls") + ] # This should default to all the tables really, but that would break all the tests elif isinstance(tables, str): return [tables] elif isinstance(tables, str): diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 103461aed5..675d5013c5 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -149,7 +149,7 @@ def skip_datecheck(request, monkeypatch): """ run_date_checks = request.node.get_closest_marker("check_available_dates", False) if not run_date_checks: - monkeypatch.setattr(EventTableSubset, "_check_dates", lambda x: True) + monkeypatch.setattr(EventTableSubset, "check_dates", lambda x: True) @pytest.fixture(autouse=True) From 4adaeac2da31487812e6ca5c8cc5c56df62f55a6 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 29 Sep 2020 15:54:54 +0100 Subject: [PATCH 08/89] Pass column names when creating table from query --- flowmachine/flowmachine/core/query.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index b3dd070228..cb84658cc2 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -387,7 +387,9 @@ def get_table(self): flowmachine.core.Table The stored version of this Query as a Table object """ - return flowmachine.core.Table(self.fully_qualified_table_name) + return flowmachine.core.Table( + self.fully_qualified_table_name, columns=self.column_names + ) def union(self, *other: "Query", all: bool = True): """ From 3056640d480ecf8a5acc5ac1879e96ecd932ce08 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 29 Sep 2020 15:58:47 +0100 Subject: [PATCH 09/89] Ensure preflight is called before getting column names in head --- flowmachine/flowmachine/core/query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index cb84658cc2..ae2223dab0 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -371,7 +371,8 @@ def head(self, n=5): try: return self._df.head(n) except AttributeError: - Q = f"SELECT {self.column_names_as_string_list} FROM ({self.get_query()}) h LIMIT {n};" + q_string = self.get_query() + Q = f"SELECT {self.column_names_as_string_list} FROM ({q_string}) h LIMIT {n};" con = get_db().engine with con.begin() as trans: df = pd.read_sql_query(Q, con=trans) From 3446e70e3c027492d7c538bfe7abdf11f040a33b Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 29 Sep 2020 16:13:12 +0100 Subject: [PATCH 10/89] Move geom column check to preflight --- flowmachine/flowmachine/core/geotable.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/flowmachine/flowmachine/core/geotable.py b/flowmachine/flowmachine/core/geotable.py index 7e4f227f01..e9714ec2b1 100644 --- a/flowmachine/flowmachine/core/geotable.py +++ b/flowmachine/flowmachine/core/geotable.py @@ -9,6 +9,7 @@ from . 
import Table from .mixins import GeoDataMixin +from .preflight import pre_flight class GeoTable(GeoDataMixin, Table): @@ -52,13 +53,18 @@ def __init__( self.geom_column = geom_column self.gid_column = gid_column super().__init__(name=name, schema=schema, columns=columns) - if geom_column not in self.column_names: + + @pre_flight + def check_geom_column_present(self): + if self.geom_column not in self.column_names: raise ValueError( - "geom_column: {} is not a column in this table.".format(geom_column) + "geom_column: {} is not a column in this table.".format( + self.geom_column + ) ) - if gid_column is not None and gid_column not in self.column_names: + if self.gid_column is not None and self.gid_column not in self.column_names: raise ValueError( - "gid_column: {} is not a column in this table.".format(gid_column) + "gid_column: {} is not a column in this table.".format(self.gid_column) ) def _geo_augmented_query(self): From 4d557d88a679044b0d86c9c5464ee6cda54857df Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 2 Oct 2020 11:50:50 +0100 Subject: [PATCH 11/89] Protect all tables, drop events prefix --- flowmachine/flowmachine/core/cache.py | 6 +- flowmachine/flowmachine/core/events_table.py | 120 ++++++++++++++++++ flowmachine/flowmachine/core/flowdb_table.py | 13 ++ .../flowmachine/core/infrastructure_table.py | 120 ++++++++++++++++++ flowmachine/flowmachine/core/preflight.py | 10 +- flowmachine/flowmachine/core/query.py | 9 +- .../server/query_schemas/custom_fields.py | 7 - flowmachine/flowmachine/core/spatial_unit.py | 29 +++-- flowmachine/flowmachine/core/table.py | 42 +++--- .../features/network/total_network_objects.py | 2 - .../spatial/versioned_infrastructure.py | 3 +- .../features/subscriber/contact_balance.py | 4 +- .../features/subscriber/mds_volume.py | 2 +- .../subscriber/subscriber_call_durations.py | 8 +- .../features/subscriber/subscriber_tacs.py | 6 +- .../features/subscriber/topup_amount.py | 2 +- .../features/subscriber/topup_balance.py | 2 +- .../features/utilities/event_table_subset.py | 14 +- .../features/utilities/events_tables_union.py | 7 +- flowmachine/tests/conftest.py | 6 +- .../test_sql_strings_and_results.py | 8 +- flowmachine/tests/test_cache_utils.py | 1 + flowmachine/tests/test_daily_location.py | 2 +- flowmachine/tests/test_event_table_subset.py | 6 +- flowmachine/tests/test_events_table_union.py | 10 +- flowmachine/tests/test_geotable.py | 7 +- flowmachine/tests/test_join_to_location.py | 9 +- flowmachine/tests/test_query_union.py | 8 +- flowmachine/tests/test_random.py | 2 +- flowmachine/tests/test_raster_statistics.py | 8 +- .../tests/test_redacted_total_events.py | 4 +- .../test_redacted_unique_subscriber_counts.py | 2 +- flowmachine/tests/test_spatial_unit.py | 15 ++- .../tests/test_subscriber_call_durations.py | 9 +- flowmachine/tests/test_subscriber_degree.py | 12 +- .../tests/test_subscriber_event_count.py | 14 +- .../test_subscriber_event_type_proportion.py | 22 ++-- .../tests/test_subscriber_location_cluster.py | 5 +- .../tests/test_subscriber_location_subset.py | 8 +- .../tests/test_subscriber_locations.py | 6 +- .../tests/test_subscriber_subsetting.py | 8 +- flowmachine/tests/test_subsetting.py | 6 +- flowmachine/tests/test_table.py | 48 ++++--- .../tests/test_total_location_events.py | 2 +- 44 files changed, 454 insertions(+), 180 deletions(-) create mode 100644 flowmachine/flowmachine/core/events_table.py create mode 100644 flowmachine/flowmachine/core/flowdb_table.py create mode 100644 
flowmachine/flowmachine/core/infrastructure_table.py diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index cecd13aa0a..7fa5d0dd4d 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -29,6 +29,7 @@ from flowmachine.core.query_state import QueryStateMachine, QueryEvent from flowmachine import __version__ + if TYPE_CHECKING: from .query import Query from .connection import Connection @@ -495,6 +496,9 @@ def get_cached_query_objects_ordered_by_score( Returns a list of cached Query objects with their on disk sizes """ + from flowmachine.core.events_table import events_table_map + from flowmachine.core.infrastructure_table import infrastructure_table_map + protected_period_clause = ( (f" AND NOW()-created > INTERVAL '{protected_period} seconds'") if protected_period is not None @@ -502,7 +506,7 @@ def get_cached_query_objects_ordered_by_score( ) qry = f"""SELECT query_id, table_size(tablename, schema) as table_size FROM cache.cached - WHERE cached.class!='Table' AND cached.class!='GeoTable' + WHERE NOT (cached.class=ANY(ARRAY{['Table', 'GeoTable', *[cls.__name__ for cls in events_table_map.values()], *[cls.__name__ for cls in infrastructure_table_map.values()]]})) {protected_period_clause} ORDER BY cache_score(cache_score_multiplier, compute_time, table_size(tablename, schema)) ASC """ diff --git a/flowmachine/flowmachine/core/events_table.py b/flowmachine/flowmachine/core/events_table.py new file mode 100644 index 0000000000..bd712b8450 --- /dev/null +++ b/flowmachine/flowmachine/core/events_table.py @@ -0,0 +1,120 @@ +from flowmachine.core.flowdb_table import FlowDBTable + + +class EventsTable(FlowDBTable): + def __init__(self, *, name, columns): + super().__init__(schema="events", name=name, columns=columns) + + +class CallsTable(EventsTable): + all_columns = [ + "id", + "outgoing", + "datetime", + "duration", + "network", + "msisdn", + "msisdn_counterpart", + "location_id", + "imsi", + "imei", + "tac", + "operator_code", + "country_code", + ] + + def __init__(self, *, columns=None): + super().__init__(name="calls", columns=columns) + + +class ForwardsTable(EventsTable): + all_columns = [ + "id", + "outgoing", + "datetime", + "network", + "msisdn", + "msisdn_counterpart", + "location_id", + "imsi", + "imei", + "tac", + "operator_code", + "country_code", + ] + + def __init__(self, *, columns=None): + super().__init__(name="forwards", columns=columns) + + +class SmsTable(EventsTable): + all_columns = [ + "id", + "outgoing", + "datetime", + "network", + "msisdn", + "msisdn_counterpart", + "location_id", + "imsi", + "imei", + "tac", + "operator_code", + "country_code", + ] + + def __init__(self, *, columns): + super().__init__(name="sms", columns=columns) + + +class MdsTable(EventsTable): + all_columns = [ + "id", + "datetime", + "duration", + "volume_total", + "volume_upload", + "volume_download", + "msisdn", + "location_id", + "imsi", + "imei", + "tac", + "operator_code", + "country_code", + ] + + def __init__(self, *, columns): + super().__init__(name="mds", columns=columns) + + +class TopupsTable(EventsTable): + all_columns = [ + "id", + "datetime", + "type", + "recharge_amount", + "airtime_fee", + "tax_and_fee", + "pre_event_balance", + "post_event_balance", + "msisdn", + "location_id", + "imsi", + "imei", + "tac", + "opera" "tor_code", + "country_code", + ] + + def __init__(self, *, columns): + super().__init__(name="topups", columns=columns) + + +events_table_map = dict( + calls=CallsTable, + 
sms=SmsTable, + mds=MdsTable, + topups=TopupsTable, + forwards=ForwardsTable, +) diff --git a/flowmachine/flowmachine/core/flowdb_table.py b/flowmachine/flowmachine/core/flowdb_table.py new file mode 100644 index 0000000000..fded845ab3 --- /dev/null +++ b/flowmachine/flowmachine/core/flowdb_table.py @@ -0,0 +1,13 @@ +from flowmachine.core.table import Table + + +class FlowDBTable(Table): + def __init__(self, *, name, schema, columns): + if columns is None: + columns = self.all_columns + if set(columns).issubset(self.all_columns): + super().__init__(schema=schema, name=name, columns=columns) + else: + raise ValueError( + f"Columns {columns} must be a subset of {self.all_columns}" + ) diff --git a/flowmachine/flowmachine/core/infrastructure_table.py b/flowmachine/flowmachine/core/infrastructure_table.py new file mode 100644 index 0000000000..5501655255 --- /dev/null +++ b/flowmachine/flowmachine/core/infrastructure_table.py @@ -0,0 +1,120 @@ +from flowmachine.core.flowdb_table import FlowDBTable + + +class InfrastructureTable(FlowDBTable): + def __init__(self, *, name, columns): + super().__init__(schema="infrastructure", name=name, columns=columns) + + +class SitesTable(InfrastructureTable): + all_columns = [ + "site_id", + "id", + "version", + "name", + "type", + "status", + "structure_type", + "is_cow", + "date_of_first_service", + "date_of_last_service", + "geom_point", + "geom_polygon", + ] + + def __init__(self, *, columns=None): + super().__init__(name="sites", columns=columns) + + +class CellsTable(InfrastructureTable): + all_columns = [ + "cell_id", + "id", + "version", + "site_id", + "name", + "type", + "msc", + "bsc_rnc", + "antenna_type", + "status", + "lac", + "height", + "azimuth", + "transmitter", + "max_range", + "min_range", + "electrical_tilt", + "mechanical_downtilt", + "date_of_f" "irst_service", + "date_of_last_service", + "geom_point", + "geom_polygon", + ] + + def __init__(self, *, columns=None): + super().__init__(name="cells", columns=columns) + + +class TacsTable(InfrastructureTable): + all_columns = [ + "id", + "brand", + "model", + "width", + "height", + "depth", + "weight", + "display_type", + "display_colors", + "display_width", + "display_height", + "mms_receiver", + "mms_built_in_camera", + "wap_push_ota_support", + "hardware_gprs", + "hardw" "are_edge", + "hardware_umts", + "hardware_wifi", + "hardware_bluetooth", + "hardware_gps", + "software_os_vendor", + "software_os_name", + "software_os_version", + "wap_push_ota_settings", + "wap_push_ota_bookmarks", + "wap_push_ota" "_app_internet", + "wap_push_ota_app_browser", + "wap_push_ota_app_mms", + "wap_push_ota_single_shot", + "wap_push_ota_multi_shot", + "wap_push_oma_settings", + "wap_push_oma_app_internet", + "wap_push_oma_app_browser", + "wap_" "push_oma_app_mms", + "wap_push_oma_cp_bookmarks", + "wap_1_2_1", + "wap_2_0", + "syncml_dm_settings", + "syncml_dm_acc_gprs", + "syncml_dm_app_internet", + "syncml_dm_app_browser", + "syncml_dm_app_mms", + "syncml_dm_app_bookmark", + "syncml_dm_app_java", + "wap_push_oma_app_ims", + "wap_push_oma_app_poc", + "j2me_midp_10", + "j2me_midp_20", + "j2me_midp_21", + "j2me_cldc_10", + "j2me_cldc_11", + "j2me_cldc_20", + "hnd_type", + ] + + def __init__(self, *, columns=None): + super().__init__(name="tacs", columns=columns) + + +infrastructure_table_map = dict(tacs=TacsTable, cells=CellsTable, sites=SitesTable) diff --git a/flowmachine/flowmachine/core/preflight.py b/flowmachine/flowmachine/core/preflight.py index 303fa64be5..c994133222 100644 --- 
a/flowmachine/flowmachine/core/preflight.py +++ b/flowmachine/flowmachine/core/preflight.py @@ -5,6 +5,7 @@ import typing import networkx as nx +import structlog from flowmachine.core.dependency_graph import ( calculate_dependency_graph, @@ -13,6 +14,8 @@ ) from flowmachine.core.errors.flowmachine_errors import QueryErroredException +logger = structlog.get_logger("flowmachine.debug", submodule=__name__) + def pre_flight(method): method.__hooks__ = getattr(method, "__hooks__", {}) @@ -61,7 +64,7 @@ def resolve_hooks(cls) -> typing.Dict[str, typing.List[typing.Callable]]: class Preflight: def preflight(self): - errors = [] + errors = dict() dep_graph = _assemble_dependency_graph( dependencies=get_dependency_links(self), attrs_func=lambda x: dict(query=x), @@ -75,8 +78,11 @@ def preflight(self): try: getattr(dependency, hook)() except Exception as e: - errors.append((dependency, e)) + errors.setdefault(dependency.query_id, []).append(e) if len(errors) > 0: + logger.debug( + "Pre-flight failed.", query=self, query_id=self.query_id, errors=errors + ) raise QueryErroredException( f"Pre-flight failed for '{self.query_id}'. Errors: {errors}" ) diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index ae2223dab0..7b7fb8c573 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -388,9 +388,12 @@ def get_table(self): flowmachine.core.Table The stored version of this Query as a Table object """ - return flowmachine.core.Table( - self.fully_qualified_table_name, columns=self.column_names - ) + if self.is_stored: + return flowmachine.core.Table( + self.fully_qualified_table_name, columns=self.column_names + ) + else: + raise ValueError(f"{self} not stored on this connection.") def union(self, *other: "Query", all: bool = True): """ diff --git a/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py b/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py index 0641895f54..5bda8ba328 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py +++ b/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py @@ -77,13 +77,6 @@ def __init__( **kwargs, ) - def _deserialize(self, value, attr, data, **kwargs): - # Temporary workaround for https://github.com/Flowminder/FlowKit/issues/1015 until underlying issue resolved - return [ - f"events.{event_type}" - for event_type in set(super()._deserialize(value, attr, data, **kwargs)) - ] - class TotalBy(fields.String): """ diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index e648bcf70d..e2103b943f 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -20,6 +20,7 @@ # TODO: Currently most spatial units require a FlowDB connection at init time. # It would be useful to remove this requirement wherever possible, and instead # implement a method to check whether the required data can be found in the DB. 
+from .infrastructure_table import infrastructure_table_map def _substitute_lat_lon(location_dict): @@ -279,7 +280,7 @@ def __init__( *, geom_table_column_names: Union[str, Iterable[str]], location_id_column_names: Union[str, Iterable[str]], - geom_table: Optional[Union[Query, str]] = None, + geom_table: Optional[Query] = None, mapping_table: Optional[Union[Query, str]] = None, geom_column: str = "geom", geom_table_join_on: Optional[str] = None, @@ -307,11 +308,11 @@ def __init__( if geom_table is None: # Creating a Table object here means that we don't have to handle # tables and Query objects differently in _make_query and get_geom_query - self.geom_table = Table(name="infrastructure.cells") + self.geom_table = infrastructure_table_map["cells"]() elif isinstance(geom_table, Query): self.geom_table = geom_table else: - self.geom_table = Table(name=geom_table) + raise TypeError("geom_table must be a Query or Table object.") if mapping_table is not None: # Creating a Table object here means that we don't have to handle @@ -319,7 +320,10 @@ def __init__( if isinstance(mapping_table, Query): self.mapping_table = mapping_table else: - self.mapping_table = Table(name=mapping_table) + self.mapping_table = Table( + name=mapping_table, + columns=[location_table_join_on, geom_table_join_on], + ) if location_table_join_on not in self.mapping_table.column_names: raise ValueError( @@ -657,8 +661,8 @@ def __init__( self, *, geom_table_column_names: Union[str, Iterable[str]], - geom_table: Union[Query, str], - mapping_table: Optional[Union[Query, str]] = None, + geom_table: Query, + mapping_table: Optional[Query] = None, geom_column: str = "geom", geom_table_join_on: Optional[str] = None, ): @@ -707,7 +711,7 @@ def __init__(self) -> None: super().__init__( geom_table_column_names=["version"], location_id_column_names=["location_id", "version"], - geom_table="infrastructure.cells", + geom_table=infrastructure_table_map["cells"](), ) @pre_flight @@ -735,7 +739,7 @@ def __init__(self) -> None: "version", ], location_id_column_names=["site_id", "version"], - geom_table="infrastructure.sites", + geom_table=infrastructure_table_map["sites"](), geom_table_join_on="id", location_table_join_on="site_id", ) @@ -784,12 +788,15 @@ def __init__( col_name = f"admin{level}pcod AS pcod" else: col_name = region_id_column_name - table = f"geography.admin{level}" self.level = level super().__init__( geom_table_column_names=col_name, - geom_table=table, + geom_table=Table( + schema="geography", + name=f"admin{level}", + columns=[f"admin{level}pcod", f"admin{level}name", "geom"], + ), mapping_table=mapping_table, geom_table_join_on=None if mapping_table is None else f"admin{level}pcod", ) @@ -841,7 +848,7 @@ def make_spatial_unit( level: Optional[int] = None, region_id_column_name: Optional[Union[str, Iterable[str]]] = None, size: Union[float, int] = None, - geom_table: Optional[Union[Query, str]] = None, + geom_table: Optional[Query] = None, geom_column: str = "geom", mapping_table: Optional[Union[str, Query]] = None, geom_table_join_on: Optional[str] = None, diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 9262a5696c..4a80584c04 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -10,12 +10,11 @@ from typing import List, Iterable, Optional from flowmachine.core.query_state import QueryStateMachine -from .context import get_db, get_redis -from .errors import NotConnectedError -from .preflight import pre_flight -from .query import 
Query -from .subset import subset_factory -from .cache import write_cache_metadata +from flowmachine.core.context import get_db, get_redis +from flowmachine.core.preflight import pre_flight +from flowmachine.core.query import Query +from flowmachine.core.subset import subset_factory +from flowmachine.core.cache import write_cache_metadata import structlog @@ -87,10 +86,8 @@ def __init__( if isinstance(columns, str): # Wrap strings in a list columns = [columns] self.columns = columns - if self.columns is not None and len(self.columns) > 0: - self.parent_table = Table( - schema=self.schema, name=self.name - ) # Point to the full table + if self.columns is None or len(self.columns) == 0: + raise ValueError("No columns requested.") super().__init__() @pre_flight @@ -109,23 +106,16 @@ def check_columns(self): ) ) )[0] - if ( - self.columns is None or self.columns == [] - ): # No columns specified, setting them from the database - self.columns = db_columns - else: - - logger.debug( - "Checking provided columns against db columns.", - provided=columns, - db_columns=db_columns, + + logger.debug( + "Checking provided columns against db columns.", + provided=columns, + db_columns=db_columns, + ) + if not set(self.columns).issubset(db_columns): + raise ValueError( + f"{set(self.columns).difference(db_columns)} are not columns of {self.fqn}" ) - if not set(self.columns).issubset(db_columns): - raise ValueError( - "{} are not columns of {}".format( - set(self.columns).difference(db_columns), self.fqn - ) - ) @pre_flight def ff_state_machine(self): diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index 06bbf1f3ce..a4aba4c38c 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -83,8 +83,6 @@ def __init__( self.table = table if isinstance(self.table, str): self.table = self.table.lower() - if self.table != "all" and not self.table.startswith("events"): - self.table = "events.{}".format(self.table) network_object.verify_criterion("is_network_object") self.network_object = network_object diff --git a/flowmachine/flowmachine/features/spatial/versioned_infrastructure.py b/flowmachine/flowmachine/features/spatial/versioned_infrastructure.py index d9318296f7..12f5222df2 100644 --- a/flowmachine/flowmachine/features/spatial/versioned_infrastructure.py +++ b/flowmachine/flowmachine/features/spatial/versioned_infrastructure.py @@ -14,6 +14,7 @@ from datetime import datetime from flowmachine.core import Table +from ...core.infrastructure_table import infrastructure_table_map from ...core.query import Query @@ -62,7 +63,7 @@ def __init__(self, table="sites", date=None): if date == None: date = datetime.now().strftime("%Y-%m-%d") - self.table = Table(schema="infrastructure", name=table) + self.table = infrastructure_table_map[table]() self.date = date super().__init__() diff --git a/flowmachine/flowmachine/features/subscriber/contact_balance.py b/flowmachine/flowmachine/features/subscriber/contact_balance.py index 5be273ded1..1a6ca5f44e 100644 --- a/flowmachine/flowmachine/features/subscriber/contact_balance.py +++ b/flowmachine/flowmachine/features/subscriber/contact_balance.py @@ -77,9 +77,7 @@ def __init__( subscriber_subset=None, ): self.tables = ( - ["events.calls", "events.sms"] - if tables.lower() == "all" or tables is None - else tables + ["calls", "sms"] if tables.lower() == "all" or tables is None else tables ) 
self.start = standardise_date(start) self.stop = standardise_date(stop) diff --git a/flowmachine/flowmachine/features/subscriber/mds_volume.py b/flowmachine/flowmachine/features/subscriber/mds_volume.py index 8102eaae0a..df11637446 100644 --- a/flowmachine/flowmachine/features/subscriber/mds_volume.py +++ b/flowmachine/flowmachine/features/subscriber/mds_volume.py @@ -67,7 +67,7 @@ def __init__( self.hours = hours self.volume = volume self.statistic = Statistic(statistic.lower()) - self.tables = "events.mds" + self.tables = "mds" if self.volume not in {"total", "upload", "download"}: raise ValueError(f"{self.volume} is not a valid volume.") diff --git a/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py b/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py index 825dca9ec7..7d6176149b 100644 --- a/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py +++ b/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py @@ -86,7 +86,7 @@ def __init__( self.unioned_query = EventsTablesUnion( self.start, self.stop, - tables="events.calls", + tables="calls", columns=column_list, hours=hours, subscriber_subset=subscriber_subset, @@ -185,7 +185,7 @@ def __init__( EventsTablesUnion( self.start, self.stop, - tables="events.calls", + tables="calls", columns=column_list, hours=hours, subscriber_subset=subscriber_subset, @@ -275,7 +275,7 @@ def __init__( self.unioned_query = EventsTablesUnion( self.start, self.stop, - tables="events.calls", + tables="calls", columns=column_list, hours=hours, subscriber_subset=subscriber_subset, @@ -376,7 +376,7 @@ def __init__( EventsTablesUnion( self.start, self.stop, - tables="events.calls", + tables="calls", columns=column_list, hours=hours, subscriber_subset=subscriber_subset, diff --git a/flowmachine/flowmachine/features/subscriber/subscriber_tacs.py b/flowmachine/flowmachine/features/subscriber/subscriber_tacs.py index 09e5787f35..6abf671c48 100644 --- a/flowmachine/flowmachine/features/subscriber/subscriber_tacs.py +++ b/flowmachine/flowmachine/features/subscriber/subscriber_tacs.py @@ -17,6 +17,7 @@ from .metaclasses import SubscriberFeature from ...core import Table from flowmachine.utils import standardise_date +from ...core.infrastructure_table import TacsTable valid_characteristics = { "brand", @@ -312,7 +313,7 @@ def __init__( subscriber_identifier=subscriber_identifier, subscriber_subset=subscriber_subset, ) - self.tacs = Table("infrastructure.tacs") + self.tacs = TacsTable() self.joined = self.subscriber_tacs.join(self.tacs, "tac", "id", how="left") super().__init__() @@ -393,8 +394,7 @@ def __init__( method=method, subscriber_subset=subscriber_subset, ) - self.method = method - self.tacs = Table("infrastructure.tacs") + self.tacs = TacsTable() self.joined = self.subscriber_tac.join(self.tacs, "tac", "id", how="left") super().__init__() diff --git a/flowmachine/flowmachine/features/subscriber/topup_amount.py b/flowmachine/flowmachine/features/subscriber/topup_amount.py index 074a05e085..1dcbc0750b 100644 --- a/flowmachine/flowmachine/features/subscriber/topup_amount.py +++ b/flowmachine/flowmachine/features/subscriber/topup_amount.py @@ -65,7 +65,7 @@ def __init__( self.subscriber_identifier = subscriber_identifier self.hours = hours self.statistic = Statistic(statistic.lower()) - self.tables = "events.topups" + self.tables = "topups" column_list = [self.subscriber_identifier, "recharge_amount"] diff --git a/flowmachine/flowmachine/features/subscriber/topup_balance.py 
b/flowmachine/flowmachine/features/subscriber/topup_balance.py index b489871fbe..7a8bbe318a 100644 --- a/flowmachine/flowmachine/features/subscriber/topup_balance.py +++ b/flowmachine/flowmachine/features/subscriber/topup_balance.py @@ -90,7 +90,7 @@ def __init__( self.subscriber_identifier = subscriber_identifier self.hours = hours self.statistic = Statistic(statistic.lower()) - self.tables = "events.topups" + self.tables = "topups" column_list = [ self.subscriber_identifier, diff --git a/flowmachine/flowmachine/features/utilities/event_table_subset.py b/flowmachine/flowmachine/features/utilities/event_table_subset.py index 4c5cb9ae91..f24bb5b26d 100644 --- a/flowmachine/flowmachine/features/utilities/event_table_subset.py +++ b/flowmachine/flowmachine/features/utilities/event_table_subset.py @@ -12,6 +12,7 @@ from ...core import Query, Table from ...core.context import get_db from ...core.errors import MissingDateError +from ...core.events_table import events_table_map from ...core.preflight import pre_flight from ...core.sqlalchemy_utils import ( get_sqlalchemy_table_definition, @@ -77,9 +78,9 @@ def __init__( stop=None, hours: Optional[Tuple[int, int]] = None, hour_slices=None, - table="events.calls", + table="calls", subscriber_subset=None, - columns=["*"], + columns=None, subscriber_identifier="msisdn", ): if hours == "all": @@ -122,12 +123,9 @@ def __init__( self.hours = hours self.subscriber_subsetter = make_subscriber_subsetter(subscriber_subset) self.subscriber_identifier = subscriber_identifier.lower() - if columns == ["*"]: - self.table_ORIG = Table(table) - columns = self.table_ORIG.column_names - else: - self.table_ORIG = Table(table, columns=columns) - self.columns = set(columns) + self.table_ORIG = events_table_map[table](columns=columns) + + self.columns = set(self.table_ORIG.column_names) try: self.columns.remove(subscriber_identifier) self.columns.add(f"{subscriber_identifier} AS subscriber") diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index 83f728cc33..2d63805e91 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -83,9 +83,10 @@ def column_names(self) -> List[str]: def _parse_tables(self, tables): if tables is None: - return [ - f"events.{t}" for t in ("sms", "calls") - ] # This should default to all the tables really, but that would break all the tests + return ( + "sms", + "calls", + ) # This should default to all the tables really, but that would break all the tests elif isinstance(tables, str): return [tables] elif isinstance(tables, str): diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 675d5013c5..1bccfd19f1 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -25,7 +25,7 @@ ) import flowmachine -from flowmachine.core import make_spatial_unit +from flowmachine.core import make_spatial_unit, Table from flowmachine.core.cache import reset_cache from flowmachine.core.context import ( redis_connection, @@ -103,12 +103,12 @@ def parse_json(): { "spatial_unit_type": "polygon", "region_id_column_name": "admin3pcod", - "geom_table": "geography.admin3", + "geom_table": Table("geography.admin3", columns=["geom", "admin3pcod"]), }, { "spatial_unit_type": "polygon", "region_id_column_name": "id AS site_id", - "geom_table": "infrastructure.sites", + "geom_table": Table("infrastructure.sites", columns=["geom_point", 
"id"]), "geom_column": "geom_point", }, ], diff --git a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py index be52003b5f..c69abb2276 100644 --- a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py +++ b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py @@ -210,9 +210,7 @@ def test_daily_location_6_sql(diff_reporter): """, ["subscriber"], ) - dl = daily_location( - "2016-01-03", table="events.calls", subscriber_subset=subset_query - ) + dl = daily_location("2016-01-03", table="calls", subscriber_subset=subset_query) sql = pretty_sql(dl.get_query()) diff_reporter(sql) @@ -229,9 +227,7 @@ def test_daily_location_6_df(get_dataframe, diff_reporter): """, ["outgoing", "datetime", "duration", "subscriber"], ) - dl = daily_location( - "2016-01-03", table="events.calls", subscriber_subset=subset_query - ) + dl = daily_location("2016-01-03", table="calls", subscriber_subset=subset_query) df = get_dataframe(dl) diff_reporter(df.to_csv()) diff --git a/flowmachine/tests/test_cache_utils.py b/flowmachine/tests/test_cache_utils.py index 04eed46190..3b69091be6 100644 --- a/flowmachine/tests/test_cache_utils.py +++ b/flowmachine/tests/test_cache_utils.py @@ -110,6 +110,7 @@ def test_touch_cache_record_for_table(flowmachine_connect): Touching a cache record for a table should update access count and last accessed but not touch score, or counter. """ table = Table("events.calls_20160101") + table.preflight() with get_db().engine.begin() as conn: conn.exec_driver_sql( f"UPDATE cache.cached SET compute_time = 1 WHERE query_id=%(ident)s", diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index 818e64ffb4..be206862d4 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -78,4 +78,4 @@ def test_daily_locs_errors(): """ with pytest.raises(MissingDateError): - daily_location("2016-01-31") + daily_location("2016-01-31").preflight() diff --git a/flowmachine/tests/test_event_table_subset.py b/flowmachine/tests/test_event_table_subset.py index 8bbc60715c..4e1af4b09f 100644 --- a/flowmachine/tests/test_event_table_subset.py +++ b/flowmachine/tests/test_event_table_subset.py @@ -56,9 +56,11 @@ def test_error_on_all_missing(): Date subsetter should error when all dates are missing. 
""" with pytest.raises(MissingDateError): - EventTableSubset(start="2016-05-01", stop="2016-05-02") + EventTableSubset(start="2016-05-01", stop="2016-05-02").preflight() with pytest.raises(MissingDateError): - EventTableSubset(start="2016-05-01", stop="2016-05-02", table="events.topups") + EventTableSubset( + start="2016-05-01", stop="2016-05-02", table="topups" + ).preflight() def test_handles_mins(get_dataframe): diff --git a/flowmachine/tests/test_events_table_union.py b/flowmachine/tests/test_events_table_union.py index 2680a12279..3296839a4c 100644 --- a/flowmachine/tests/test_events_table_union.py +++ b/flowmachine/tests/test_events_table_union.py @@ -8,12 +8,12 @@ @pytest.mark.parametrize( - "columns", [["msisdn"], ["*"], ["id", "msisdn"]], ids=lambda x: f"{x}" + "columns", [["msisdn"], ["id", "msisdn"]], ids=lambda x: f"{x}" ) def test_events_tables_union_column_names(columns): """Test that EventsTableUnion column_names property is accurate.""" etu = EventsTablesUnion( - "2016-01-01", "2016-01-02", columns=columns, tables=["events.calls"] + "2016-01-01", "2016-01-02", columns=columns, tables=["calls"] ) assert etu.head(0).columns.tolist() == etu.column_names @@ -25,7 +25,7 @@ def test_events_table_union_subscriber_ident_substitutions(ident): "2016-01-01", "2016-01-02", columns=[ident], - tables=["events.calls"], + tables=["calls"], subscriber_identifier=ident, ) assert "subscriber" == etu.head(0).columns[0] @@ -57,7 +57,7 @@ def test_get_only_sms(get_length): "2016-01-01", "2016-01-02", columns=["msisdn", "msisdn_counterpart", "datetime"], - tables="events.sms", + tables="sms", ) assert get_length(etu) == 1246 @@ -94,6 +94,6 @@ def test_get_list_of_tables(get_length): "2016-01-01", "2016-01-02", columns=["msisdn", "msisdn_counterpart", "datetime"], - tables=["events.calls", "events.sms"], + tables=["calls", "sms"], ) assert get_length(etu) == 2500 diff --git a/flowmachine/tests/test_geotable.py b/flowmachine/tests/test_geotable.py index a5627feb19..c9fdc493c0 100644 --- a/flowmachine/tests/test_geotable.py +++ b/flowmachine/tests/test_geotable.py @@ -13,7 +13,7 @@ def test_geotable_bad_params(): def test_geotable(): """Test that geotable will work with an obviously geographic table.""" - t = GeoTable("geography.admin3") + t = GeoTable("geography.admin3", columns=["geom", "admin3pcod", "admin0name"]) feature = t.to_geojson()["features"][0] assert feature["properties"]["admin0name"] == "Nepal" @@ -36,8 +36,7 @@ def test_geotable_uses_supplied_gid(): assert feature["id"] == "Sindhupalchok" -@pytest.mark.parametrize("columns", [None, ["gid", "geom"]]) -def test_geotable_column_names(columns): +def test_geotable_column_names(): """Test that column_names property matches head(0) for geotables""" - t = GeoTable("geography.admin3", columns=columns) + t = GeoTable("geography.admin3", columns=["gid", "geom"]) assert t.head(0).columns.tolist() == t.column_names diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index d5b905f96f..f162165dbb 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -9,7 +9,12 @@ import numpy as np from flowmachine.features import SubscriberLocations -from flowmachine.core import JoinToLocation, location_joined_query, make_spatial_unit +from flowmachine.core import ( + JoinToLocation, + location_joined_query, + make_spatial_unit, + Table, +) from flowmachine.core.errors import InvalidSpatialUnitError @@ -124,7 +129,7 @@ def test_join_with_polygon(get_dataframe, 
get_length): spatial_unit=make_spatial_unit( "polygon", region_id_column_name="admin3pcod", - geom_table="geography.admin3", + geom_table=Table("geography.admin3", columns=["admin3pcod", "geom"]), geom_column="geom", ), ) diff --git a/flowmachine/tests/test_query_union.py b/flowmachine/tests/test_query_union.py index f86d1fbe30..66a6928e14 100644 --- a/flowmachine/tests/test_query_union.py +++ b/flowmachine/tests/test_query_union.py @@ -10,7 +10,9 @@ def test_union_column_names(): """Test that Union's column_names property is accurate""" - union = Table("events.calls_20160101").union(Table("events.calls_20160102")) + union = Table("events.calls_20160101", columns=["id"]).union( + Table("events.calls_20160102"), columns=["id"] + ) assert union.head(0).columns.tolist() == union.column_names @@ -18,7 +20,7 @@ def test_union_all(get_dataframe): """ Test default union behaviour keeps duplicates. """ - q1 = Table(schema="events", name="calls") + q1 = Table(schema="events", name="calls", columns=["id"]) union_all = q1.union(q1) union_all_df = get_dataframe(union_all) single_id = union_all_df[union_all_df.id == "5wNJA-PdRJ4-jxEdG-yOXpZ"] @@ -29,7 +31,7 @@ def test_union(get_dataframe): """ Test union with all set to false dedupes. """ - q1 = Table(schema="events", name="calls") + q1 = Table(schema="events", name="calls", columns=["id"]) union = q1.union(q1, all=False) union_df = get_dataframe(union) single_id = union_df[union_df.id == "5wNJA-PdRJ4-jxEdG-yOXpZ"] diff --git a/flowmachine/tests/test_random.py b/flowmachine/tests/test_random.py index 3903a1d0da..f4c70339d7 100644 --- a/flowmachine/tests/test_random.py +++ b/flowmachine/tests/test_random.py @@ -290,7 +290,7 @@ def test_pickling(): ss1 = UniqueSubscribers(start="2016-01-01", stop="2016-01-04").random_sample( size=10, sampling_method="system_rows" ) - ss2 = Table("events.calls").random_sample( + ss2 = Table("events.calls", columns=["id"]).random_sample( size=10, sampling_method="bernoulli", seed=0.73 ) for ss in [ss1, ss2]: diff --git a/flowmachine/tests/test_raster_statistics.py b/flowmachine/tests/test_raster_statistics.py index ce52185e06..d09ef605c6 100644 --- a/flowmachine/tests/test_raster_statistics.py +++ b/flowmachine/tests/test_raster_statistics.py @@ -17,7 +17,7 @@ def test_computes_expected_clipping_values(get_dataframe): RasterStatistics() returns correct values when clipping vector and raster layers. """ G = "admin2pcod" - vector = Table(schema="geography", name="admin2") + vector = Table(schema="geography", name="admin2", columns=["admin3pcod", "geom"]) r = RasterStatistics( raster="population.small_nepal_raster", vector=vector, grouping_element=G ) @@ -43,7 +43,7 @@ def test_raises_notimplemented_when_wrong_statistic_requested(): RasterStatistics() raises NotImplementedError if wrong statistic requested. """ G = "admin2pcod" - vector = Table(schema="geography", name="admin2") + vector = Table(schema="geography", name="admin2", columns=["admin2pcod", "geom"]) with pytest.raises(NotImplementedError): r = RasterStatistics( raster="population.small_nepal_raster", @@ -58,7 +58,7 @@ def test_raises_valueerror_when_grouping_element_not_provided(): RasterStatistics() raises ValueError when `grouping_element` not provided. 
""" G = None - vector = Table(schema="geography", name="admin2") + vector = Table(schema="geography", name="admin2", columns=["admin2pcod", "geom"]) with pytest.raises(ValueError): r = RasterStatistics( "population.small_nepal_raster", vector=vector, grouping_element=None @@ -79,7 +79,7 @@ def test_raster_statistics_column_names_vector(get_dataframe): Test that column_names property matches head(0) for RasterStatistics when vector is not None """ - vector = Table(schema="geography", name="admin2") + vector = Table(schema="geography", name="admin2", columns=["admin2pcod", "geom"]) r = RasterStatistics( raster="population.small_nepal_raster", vector=vector, diff --git a/flowmachine/tests/test_redacted_total_events.py b/flowmachine/tests/test_redacted_total_events.py index d1e8cbbc6c..84550c99f9 100644 --- a/flowmachine/tests/test_redacted_total_events.py +++ b/flowmachine/tests/test_redacted_total_events.py @@ -23,13 +23,13 @@ def test_all_above_threshold(get_dataframe): "2016-01-02", spatial_unit=make_spatial_unit("cell"), interval="day", - table=["events.calls"], + table=["calls"], ) ) us = get_dataframe( RedactedUniqueSubscriberCounts( unique_subscriber_counts=UniqueSubscriberCounts( - "2016-01-01", "2016-01-02", table=["events.calls"] + "2016-01-01", "2016-01-02", table=["calls"] ) ) ) diff --git a/flowmachine/tests/test_redacted_unique_subscriber_counts.py b/flowmachine/tests/test_redacted_unique_subscriber_counts.py index 02be24bfbb..368c9f0bab 100644 --- a/flowmachine/tests/test_redacted_unique_subscriber_counts.py +++ b/flowmachine/tests/test_redacted_unique_subscriber_counts.py @@ -11,7 +11,7 @@ def test_all_above_threshold(get_dataframe): """ Test that all values in the redacted query are above the redaction threshold. """ - us = UniqueSubscriberCounts("2016-01-01", "2016-01-02", table=["events.calls"]) + us = UniqueSubscriberCounts("2016-01-01", "2016-01-02", table=["calls"]) rus_df = get_dataframe(RedactedUniqueSubscriberCounts(unique_subscriber_counts=us)) us_df = get_dataframe(us) assert all(rus_df.value > 15) diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index a33caadf23..2b53f52bc6 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -4,6 +4,7 @@ from flowmachine.core import CustomQuery from flowmachine.core.errors import InvalidSpatialUnitError +from flowmachine.core.infrastructure_table import SitesTable from flowmachine.core.spatial_unit import * import pytest @@ -47,7 +48,7 @@ def test_get_geom_query_column_names( { "spatial_unit_type": "polygon", "region_id_column_name": "id", - "geom_table": "infrastructure.sites", + "geom_table": SitesTable(), "geom_column": "geom_point", }, ["id"], @@ -56,7 +57,7 @@ def test_get_geom_query_column_names( { "spatial_unit_type": "polygon", "region_id_column_name": ["id"], - "geom_table": "infrastructure.sites", + "geom_table": SitesTable(), "geom_column": "geom_point", }, ["id"], @@ -97,7 +98,7 @@ def test_polygon_spatial_unit_column_list(): passed_cols = ["id"] psu = PolygonSpatialUnit( geom_table_column_names=passed_cols, - geom_table="infrastructure.sites", + geom_table=SitesTable(), geom_column="geom_point", ) loc_cols = psu.location_id_columns @@ -127,7 +128,9 @@ def test_missing_location_columns_raises_error(): { "spatial_unit_type": "polygon", "region_id_column_name": "id", - "geom_table": "infrastructure.sites", + "geom_table": Table( + "infrastructure.sites", columns=["id", "geom_point"] + ), "geom_column": "geom_point", }, "polygon", 
@@ -158,12 +161,12 @@ def test_canonical_names(make_spatial_unit_args, expected_name): { "spatial_unit_type": "polygon", "region_id_column_name": "admin3pcod", - "geom_table": "geography.admin3", + "geom_table": Table("geography.admin3", columns=["admin3pcod", "geom"]), }, { "spatial_unit_type": "polygon", "region_id_column_name": "id", - "geom_table": "infrastructure.sites", + "geom_table": Table("infrastructure.sites", columns=["id", "geom_point"]), "geom_column": "geom_point", }, ], diff --git a/flowmachine/tests/test_subscriber_call_durations.py b/flowmachine/tests/test_subscriber_call_durations.py index 86809a177a..06164854e9 100644 --- a/flowmachine/tests/test_subscriber_call_durations.py +++ b/flowmachine/tests/test_subscriber_call_durations.py @@ -4,6 +4,7 @@ import pytest +from flowmachine.core import make_spatial_unit, Table from flowmachine.features.subscriber.subscriber_call_durations import * from flowmachine.utils import Statistic @@ -48,7 +49,9 @@ def test_polygon_tables(get_dataframe): "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit( - "polygon", geom_table="geography.admin3", region_id_column_name="admin3name" + "polygon", + geom_table=Table("geography.admin3", columns=["geom", "admin3name"]), + region_id_column_name="admin3name", ), ) df = get_dataframe(per_location_durations) @@ -67,7 +70,9 @@ def test_polygon_tables(get_dataframe): "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit( - "polygon", geom_table="geography.admin3", region_id_column_name="admin3name" + "polygon", + geom_table=Table("geography.admin3", columns=["geom", "admin3name"]), + region_id_column_name="admin3name", ), ) diff --git a/flowmachine/tests/test_subscriber_degree.py b/flowmachine/tests/test_subscriber_degree.py index 0f83858941..d9e8b9f71b 100644 --- a/flowmachine/tests/test_subscriber_degree.py +++ b/flowmachine/tests/test_subscriber_degree.py @@ -25,12 +25,8 @@ def test_returns_correct_values(get_dataframe): """ # We expect subscriber '2Dq97XmPqvL6noGk' to have a single event in df1 # and two events in df2 (due to the larger time interval). 
- ud1 = SubscriberDegree( - "2016-01-01 12:35:00", "2016-01-01 12:40:00", tables="events.sms" - ) - ud2 = SubscriberDegree( - "2016-01-01 12:28:00", "2016-01-01 12:40:00", tables="events.sms" - ) + ud1 = SubscriberDegree("2016-01-01 12:35:00", "2016-01-01 12:40:00", tables="sms") + ud2 = SubscriberDegree("2016-01-01 12:28:00", "2016-01-01 12:40:00", tables="sms") df1 = get_dataframe(ud1).set_index("subscriber") df2 = get_dataframe(ud2).set_index("subscriber") @@ -49,13 +45,13 @@ def test_returns_correct_in_out_values(get_dataframe): ud1 = SubscriberDegree( "2016-01-01 12:35:00", "2016-01-01 12:40:00", - tables="events.sms", + tables="sms", direction="in", ) ud2 = SubscriberDegree( "2016-01-01 12:28:00", "2016-01-01 12:40:00", - tables="events.sms", + tables="sms", direction="out", ) diff --git a/flowmachine/tests/test_subscriber_event_count.py b/flowmachine/tests/test_subscriber_event_count.py index d0d14acc4b..dd4a521a1d 100644 --- a/flowmachine/tests/test_subscriber_event_count.py +++ b/flowmachine/tests/test_subscriber_event_count.py @@ -19,20 +19,16 @@ def test_event_count(get_dataframe): "2016-01-01", "2016-01-08", direction="both", - tables=["events.calls", "events.sms", "events.mds", "events.topups"], + tables=["calls", "sms", "mds", "topups"], ) df = get_dataframe(query).set_index("subscriber") assert df.loc["DzpZJ2EaVQo2X5vM"].value == 69 - query = EventCount( - "2016-01-01", "2016-01-08", direction="both", tables=["events.mds"] - ) + query = EventCount("2016-01-01", "2016-01-08", direction="both", tables=["mds"]) df = get_dataframe(query).set_index("subscriber") assert df.loc["E0LZAa7AyNd34Djq"].value == 8 - query = EventCount( - "2016-01-01", "2016-01-08", direction="both", tables="events.mds" - ) + query = EventCount("2016-01-01", "2016-01-08", direction="both", tables="mds") df = get_dataframe(query).set_index("subscriber") assert df.loc["E0LZAa7AyNd34Djq"].value == 8 @@ -80,6 +76,4 @@ def test_directed_count_undirected_tables_raises(): Test that requesting directed counts of undirected tables raises warning and errors. 
""" with pytest.raises(ValueError): - query = EventCount( - "2016-01-01", "2016-01-08", direction="out", tables=["events.mds"] - ) + query = EventCount("2016-01-01", "2016-01-08", direction="out", tables=["mds"]) diff --git a/flowmachine/tests/test_subscriber_event_type_proportion.py b/flowmachine/tests/test_subscriber_event_type_proportion.py index d14e139cce..50f10afedf 100644 --- a/flowmachine/tests/test_subscriber_event_type_proportion.py +++ b/flowmachine/tests/test_subscriber_event_type_proportion.py @@ -10,12 +10,12 @@ @pytest.mark.parametrize( "numerator, numerator_direction, msisdn, want", [ - ("events.calls", "both", "AgB6KR3Levd9Z1vJ", 0.351_852), - ("events.calls", "in", "AgB6KR3Levd9Z1vJ", 0.203_703_703_7), - ("events.sms", "both", "7ra3xZakjEqB1Al5", 0.362_069), - ("events.mds", "both", "QrAlXqDbXDkNJe3E", 0.236_363_63), - ("events.topups", "both", "bKZLwjrMQG7z468y", 0.183_098_5), - (["events.calls", "events.sms"], "both", "AgB6KR3Levd9Z1vJ", 0.648_148_1), + ("calls", "both", "AgB6KR3Levd9Z1vJ", 0.351_852), + ("calls", "in", "AgB6KR3Levd9Z1vJ", 0.203_703_703_7), + ("sms", "both", "7ra3xZakjEqB1Al5", 0.362_069), + ("mds", "both", "QrAlXqDbXDkNJe3E", 0.236_363_63), + ("topups", "both", "bKZLwjrMQG7z468y", 0.183_098_5), + (["calls", "sms"], "both", "AgB6KR3Levd9Z1vJ", 0.648_148_1), ], ) def test_proportion_event_type( @@ -30,11 +30,11 @@ def test_proportion_event_type( numerator, numerator_direction=numerator_direction, tables=[ - "events.calls", - "events.sms", - "events.mds", - "events.topups", - "events.forwards", + "calls", + "sms", + "mds", + "topups", + "forwards", ], ) df = get_dataframe(query).set_index("subscriber") diff --git a/flowmachine/tests/test_subscriber_location_cluster.py b/flowmachine/tests/test_subscriber_location_cluster.py index 8df3e105df..62f7ce249a 100644 --- a/flowmachine/tests/test_subscriber_location_cluster.py +++ b/flowmachine/tests/test_subscriber_location_cluster.py @@ -182,7 +182,10 @@ def test_different_call_days_format(get_dataframe): cd.store().result() har = get_dataframe( - HartiganCluster(calldays=Table(cd.fully_qualified_table_name), radius=50) + HartiganCluster( + calldays=Table(cd.fully_qualified_table_name, columns=cd.column_names), + radius=50, + ) ) assert isinstance(har, pd.DataFrame) diff --git a/flowmachine/tests/test_subscriber_location_subset.py b/flowmachine/tests/test_subscriber_location_subset.py index 313ceb6379..155955c90d 100644 --- a/flowmachine/tests/test_subscriber_location_subset.py +++ b/flowmachine/tests/test_subscriber_location_subset.py @@ -28,7 +28,7 @@ def test_subscribers_make_atleast_one_call_in_admin0(): sls = SubscriberLocationSubset( start, stop, min_calls=1, spatial_unit=make_spatial_unit("admin", level=0) ) - us = UniqueSubscribers(start, stop, table="events.calls") + us = UniqueSubscribers(start, stop, table="calls") sls_subs = set(sls.get_dataframe()["subscriber"]) us_subs = set(us.get_dataframe()["subscriber"]) @@ -43,9 +43,9 @@ def test_subscribers_who_make_atleast_3_calls_in_central_development_region(): within Central Development admin1 region. 
""" start, stop = "2016-01-01", "2016-01-07" - regions = Table("admin2", "geography").subset( - "admin1name", ["Central Development Region"] - ) + regions = Table( + "admin2", "geography", columns=["admin1name", "admin2pcod", "geom"] + ).subset("admin1name", ["Central Development Region"]) sls = SubscriberLocationSubset( start, diff --git a/flowmachine/tests/test_subscriber_locations.py b/flowmachine/tests/test_subscriber_locations.py index 8f19dba31c..e1b151e2cb 100644 --- a/flowmachine/tests/test_subscriber_locations.py +++ b/flowmachine/tests/test_subscriber_locations.py @@ -4,7 +4,7 @@ import pytest -from flowmachine.core import make_spatial_unit +from flowmachine.core import make_spatial_unit, Table from flowmachine.features.utilities.subscriber_locations import SubscriberLocations pytestmark = pytest.mark.usefixtures("skip_datecheck") @@ -31,7 +31,9 @@ def test_can_get_pcods(get_dataframe): "2016-01-01 13:30:30", "2016-01-02 16:25:00", spatial_unit=make_spatial_unit( - "polygon", region_id_column_name="admin3pcod", geom_table="geography.admin3" + "polygon", + region_id_column_name="admin3pcod", + geom_table=Table("geography.admin3", columns=["admin3pcod", "geom"]), ), ) df = get_dataframe(subscriber_pcod) diff --git a/flowmachine/tests/test_subscriber_subsetting.py b/flowmachine/tests/test_subscriber_subsetting.py index 85eba01566..ebb282ccb5 100644 --- a/flowmachine/tests/test_subscriber_subsetting.py +++ b/flowmachine/tests/test_subscriber_subsetting.py @@ -22,12 +22,12 @@ @pytest.mark.parametrize( - "columns", [["msisdn"], ["*"], ["id", "msisdn"]], ids=lambda x: f"{x}" + "columns", [["msisdn"], None, ["id", "msisdn"]], ids=lambda x: f"{x}" ) def test_events_table_subset_column_names(columns): """Test that EventTableSubset column_names property is accurate.""" etu = EventTableSubset( - start="2016-01-01", stop="2016-01-02", columns=columns, table="events.calls" + start="2016-01-01", stop="2016-01-02", columns=columns, table="calls" ) assert etu.head(0).columns.tolist() == etu.column_names @@ -39,7 +39,7 @@ def test_events_table_subscriber_ident_substitutions(ident): start="2016-01-01", stop="2016-01-02", columns=[ident], - table="events.calls", + table="calls", subscriber_identifier=ident, ) assert "subscriber" == etu.head(0).columns[0] @@ -73,7 +73,7 @@ def subscriber_list_table(subscriber_list, flowmachine_connect): formatted_subscribers ) trans.exec_driver_sql(sql) - subs_table = Table("subscriber_list") + subs_table = Table("subscriber_list", columns=["subscriber"]) yield subs_table subs_table.invalidate_db_cache(drop=True) diff --git a/flowmachine/tests/test_subsetting.py b/flowmachine/tests/test_subsetting.py index 24e1d1dae3..df581dbb92 100644 --- a/flowmachine/tests/test_subsetting.py +++ b/flowmachine/tests/test_subsetting.py @@ -177,7 +177,7 @@ def test_subset_subset(get_dataframe): sub_vala = "Central Development Region" sub_colb = "admin2name" sub_valb = "Bagmati" - t = Table("geography.admin3") + t = Table("geography.admin3", columns=[sub_cola, sub_colb]) t_df = get_dataframe(t) sub_q = t.subset(sub_cola, sub_vala).subset(sub_colb, sub_valb) @@ -199,7 +199,7 @@ def test_subset_subsetnumeric(get_dataframe): sub_colb = "shape_area" sub_lowb = 0.1 sub_highb = 0.12 - t = Table("geography.admin3") + t = Table("geography.admin3", columns=[sub_cola, sub_colb]) t_df = get_dataframe(t) sub_q1 = t.subset(sub_cola, sub_vala).numeric_subset(sub_colb, sub_lowb, sub_highb) @@ -227,7 +227,7 @@ def test_subsetnumeric_subsetnumeric(get_dataframe): sub_colb = "shape_leng" sub_lowb = 1.0 
sub_highb = 2.0 - t = Table("geography.admin3") + t = Table("geography.admin3", columns=[sub_cola, sub_colb]) t_df = get_dataframe(t) sub_q = t.numeric_subset(sub_cola, sub_lowa, sub_lowb).numeric_subset( diff --git a/flowmachine/tests/test_table.py b/flowmachine/tests/test_table.py index 51955abf26..f2d6fd31da 100644 --- a/flowmachine/tests/test_table.py +++ b/flowmachine/tests/test_table.py @@ -3,34 +3,48 @@ import pytest from flowmachine.core import Table +from flowmachine.core.errors.flowmachine_errors import QueryErroredException -@pytest.mark.parametrize("columns", [None, ["msisdn", "id"]]) +@pytest.mark.parametrize("columns", [["msisdn", "id"]]) def test_table_column_names(columns): """Test that column_names property matches head(0) for tables""" t = Table("events.calls", columns=columns) assert t.head(0).columns.tolist() == t.column_names -def test_table_init(): +@pytest.mark.parametrize( + "args", + [ + dict(name="events.calls", schema="extra_schema", columns=["id"]), + dict(name="calls", schema="events", columns=None), + dict(name="calls", schema="events", columns=[]), + ], +) +def test_table_init(args): """ Test that table creation handles params properly. """ - t = Table("events.calls") with pytest.raises(ValueError): - Table("events.calls", "moose") - with pytest.raises(ValueError): - Table("events.calls", columns="NO SUCH COLUMN") - with pytest.raises(ValueError): - Table("NOSUCHTABLE") - with pytest.raises(ValueError): - Table("events.WHAAAAAAAAT") + Table(**args) + + +@pytest.mark.parametrize( + "args", + [ + dict(name="events.calls", columns=["NO SUCH COLUMN"]), + dict(name="NO SUCH TABLE", columns=["id"]), + ], +) +def test_table_preflight(args): + with pytest.raises(QueryErroredException): + Table(**args).preflight() def public_schema_checked(): """Test that where no schema is provided, public schema is checked.""" - t = Table("gambia_admin2") + t = Table("gambia_admin2", columns=["geom"]).preflight() def test_children(): @@ -38,8 +52,8 @@ def test_children(): Test that table inheritance is correctly detected. """ - assert Table("events.calls").has_children() - assert not Table("geography.admin3").has_children() + assert Table("events.calls", columns=["id"]).has_children() + assert not Table("geography.admin3", columns=["geom"]).has_children() def test_columns(): @@ -54,7 +68,7 @@ def test_store_with_table(): """ Test that a subset of a table can be stored. """ - t = Table("events.calls") + t = Table("events.calls", columns=["id"]) s = t.subset("id", ["5wNJA-PdRJ4-jxEdG-yOXpZ", "5wNJA-PdRJ4-jxEdG-yOXpZ"]) s.store().result() assert s.is_stored @@ -76,7 +90,7 @@ def test_dependencies(): Check that a table without explicit columns has no other queries as a dependency, and a table with explicit columns has its parent table as a dependency. """ - t1 = Table("events.calls") + t1 = Table("events.calls", columns=["id"]) assert t1.dependencies == set() t2 = Table("events.calls", columns=["id"]) @@ -89,7 +103,7 @@ def test_subset(): """ Test that a subset of a table doesn't show as stored. """ - ss = Table("events.calls").subset( + ss = Table("events.calls", columns=["id"]).subset( "id", ["5wNJA-PdRJ4-jxEdG-yOXpZ", "5wNJA-PdRJ4-jxEdG-yOXpZ"] ) assert not ss.is_stored @@ -99,7 +113,7 @@ def test_pickling(): """ Test that we can pickle and unpickle subset classes. 
""" - ss = Table("events.calls").subset( + ss = Table("events.calls", columns=["id"]).subset( "id", ["5wNJA-PdRJ4-jxEdG-yOXpZ", "5wNJA-PdRJ4-jxEdG-yOXpZ"] ) assert ss.get_query() == pickle.loads(pickle.dumps(ss)).get_query() diff --git a/flowmachine/tests/test_total_location_events.py b/flowmachine/tests/test_total_location_events.py index 264b2fc548..c671ad164e 100644 --- a/flowmachine/tests/test_total_location_events.py +++ b/flowmachine/tests/test_total_location_events.py @@ -56,7 +56,7 @@ def test_ignore_texts(get_dataframe): "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site"), - table="events.calls", + table="calls", ) df = get_dataframe(te) From 1b4b901c1fe536856b5f0fe4f912bf3f446f9ae3 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 2 Oct 2020 12:42:55 +0100 Subject: [PATCH 12/89] Fix not running preflight before store, --- flowmachine/flowmachine/core/cache.py | 20 +++++++++++++++----- flowmachine/flowmachine/core/query.py | 3 ++- flowmachine/tests/test_cache.py | 1 + 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index 7fa5d0dd4d..50638a905e 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -475,6 +475,18 @@ def get_query_object_by_id(connection: "Connection", query_id: str) -> "Query": raise ValueError(f"Query id '{query_id}' is not in cache on this connection.") +def _get_protected_classes(): + from flowmachine.core.events_table import events_table_map + from flowmachine.core.infrastructure_table import infrastructure_table_map + + return [ + "Table", + "GeoTable", + *[cls.__name__ for cls in events_table_map.values()], + *[cls.__name__ for cls in infrastructure_table_map.values()], + ] + + def get_cached_query_objects_ordered_by_score( connection: "Connection", protected_period: Optional[int] = None, @@ -496,8 +508,6 @@ def get_cached_query_objects_ordered_by_score( Returns a list of cached Query objects with their on disk sizes """ - from flowmachine.core.events_table import events_table_map - from flowmachine.core.infrastructure_table import infrastructure_table_map protected_period_clause = ( (f" AND NOW()-created > INTERVAL '{protected_period} seconds'") @@ -506,7 +516,7 @@ def get_cached_query_objects_ordered_by_score( ) qry = f"""SELECT query_id, table_size(tablename, schema) as table_size FROM cache.cached - WHERE NOT (cached.class=ANY(ARRAY{['Table', 'GeoTable', *[cls.__name__ for cls in events_table_map.values()], *[cls.__name__ for cls in infrastructure_table_map.values()]]})) + WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()})) {protected_period_clause} ORDER BY cache_score(cache_score_multiplier, compute_time, table_size(tablename, schema)) ASC """ @@ -686,9 +696,9 @@ def get_size_of_cache(connection: "Connection") -> int: Number of bytes in total used by cache tables """ - sql = """SELECT sum(table_size(tablename, schema)) as total_bytes + sql = f"""SELECT sum(table_size(tablename, schema)) as total_bytes FROM cache.cached - WHERE cached.class!='Table' AND cached.class!='GeoTable'""" + WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()}""" cache_bytes = connection.fetch(sql)[0][0] return 0 if cache_bytes is None else int(cache_bytes) diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index 7b7fb8c573..f25fafb412 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -535,8 +535,9 @@ def 
_make_sql(self, name: str, schema: Union[str, None] = None) -> List[str]: logger.info("Table already exists") return [] + q_string = self._make_query() Q = f"""EXPLAIN (ANALYZE TRUE, TIMING FALSE, FORMAT JSON) CREATE TABLE {full_name} AS - (SELECT {self.column_names_as_string_list} FROM ({self._make_query()}) _)""" + (SELECT {self.column_names_as_string_list} FROM ({q_string}) _)""" queries.append(Q) # Make flowmachine user the owner to allow server to cleanup cache tables queries.append(f"ALTER TABLE {full_name} OWNER TO flowmachine;") diff --git a/flowmachine/tests/test_cache.py b/flowmachine/tests/test_cache.py index 68efd8ad92..3f757ea2d6 100644 --- a/flowmachine/tests/test_cache.py +++ b/flowmachine/tests/test_cache.py @@ -27,6 +27,7 @@ def test_table_records_removed(flowmachine_connect): dl.store().result() assert dl.is_stored table = dl.get_table() + table.preflight() assert cache_table_exists(get_db(), table.query_id) dl.invalidate_db_cache() From a9e07578c64ad49b86d5b7939dbf6733f4c26f9e Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 2 Oct 2020 13:41:20 +0100 Subject: [PATCH 13/89] Syntax, fix tests --- flowmachine/flowmachine/core/cache.py | 2 +- .../features/utilities/event_table_subset.py | 26 ++++++------------- flowmachine/tests/test_cache_utils.py | 1 + 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index 50638a905e..15a23052bb 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -698,7 +698,7 @@ def get_size_of_cache(connection: "Connection") -> int: """ sql = f"""SELECT sum(table_size(tablename, schema)) as total_bytes FROM cache.cached - WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()}""" + WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()}))""" cache_bytes = connection.fetch(sql)[0][0] return 0 if cache_bytes is None else int(cache_bytes) diff --git a/flowmachine/flowmachine/features/utilities/event_table_subset.py b/flowmachine/flowmachine/features/utilities/event_table_subset.py index f24bb5b26d..4b755214af 100644 --- a/flowmachine/flowmachine/features/utilities/event_table_subset.py +++ b/flowmachine/flowmachine/features/utilities/event_table_subset.py @@ -149,16 +149,6 @@ def __init__( super().__init__() - @pre_flight - def create_table_def(self): - logger.debug("Creating sql alchemy table.") - # This needs to happen after the parent classes init method has been - # called as it relies upon the connection object existing - self.sqlalchemy_table = get_sqlalchemy_table_definition( - self.table_ORIG.fully_qualified_table_name, - engine=get_db().engine, - ) - @property def column_names(self) -> List[str]: return [c.split(" AS ")[-1] for c in self.columns] @@ -224,25 +214,25 @@ def check_dates(self): ) def _make_query_with_sqlalchemy(self): + sqlalchemy_table = get_sqlalchemy_table_definition( + self.table_ORIG.fully_qualified_table_name, + engine=get_db().engine, + ) sqlalchemy_columns = [ make_sqlalchemy_column_from_flowmachine_column_description( - self.sqlalchemy_table, column_str + sqlalchemy_table, column_str ) for column_str in self.columns ] select_stmt = select(*sqlalchemy_columns) if self.start is not None: - select_stmt = select_stmt.where( - self.sqlalchemy_table.c.datetime >= self.start - ) + select_stmt = select_stmt.where(sqlalchemy_table.c.datetime >= self.start) if self.stop is not None: - select_stmt = select_stmt.where( - self.sqlalchemy_table.c.datetime < self.stop - ) + select_stmt = 
select_stmt.where(sqlalchemy_table.c.datetime < self.stop) select_stmt = select_stmt.where( - self.hour_slices.get_subsetting_condition(self.sqlalchemy_table.c.datetime) + self.hour_slices.get_subsetting_condition(sqlalchemy_table.c.datetime) ) select_stmt = self.subscriber_subsetter.apply_subset_if_needed( select_stmt, subscriber_identifier=self.subscriber_identifier diff --git a/flowmachine/tests/test_cache_utils.py b/flowmachine/tests/test_cache_utils.py index 3b69091be6..7249b0db2b 100644 --- a/flowmachine/tests/test_cache_utils.py +++ b/flowmachine/tests/test_cache_utils.py @@ -526,6 +526,7 @@ def test_cache_reset_protects_tables(flowmachine_connect): """ # Regression test for https://github.com/Flowminder/FlowKit/issues/832 dl_query = daily_location(date="2016-01-03", method="last") + dl_query.preflight() reset_cache(get_db(), get_redis()) for dep in dl_query._get_stored_dependencies(): assert dep.query_id in [x.query_id for x in Query.get_stored()] From b6c841332b53575e4529f4941314176fcf34c35d Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 2 Oct 2020 13:49:57 +0100 Subject: [PATCH 14/89] Update test to refelect one less link in dep chain --- flowmachine/tests/test_cache.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/flowmachine/tests/test_cache.py b/flowmachine/tests/test_cache.py index 3f757ea2d6..5e09d35236 100644 --- a/flowmachine/tests/test_cache.py +++ b/flowmachine/tests/test_cache.py @@ -145,7 +145,9 @@ def test_invalidate_cache_multi(flowmachine_connect): assert not cache_table_exists(get_db(), dl1.query_id) assert not cache_table_exists(get_db(), hl1.query_id) has_deps = bool(get_db().fetch("SELECT * FROM cache.dependencies")) - assert has_deps # the remaining dependencies are due to underlying Table objects + assert ( + not has_deps + ) # the remaining dependencies are due to underlying Table objects def test_invalidate_cache_midchain(flowmachine_connect): From f4802a879b454db1bb8d46baa1cca098dbfae570 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 2 Oct 2020 14:23:19 +0100 Subject: [PATCH 15/89] Disallow None for start/stop --- .../features/utilities/event_table_subset.py | 59 +++++-------------- .../features/utilities/events_tables_union.py | 7 ++- 2 files changed, 20 insertions(+), 46 deletions(-) diff --git a/flowmachine/flowmachine/features/utilities/event_table_subset.py b/flowmachine/flowmachine/features/utilities/event_table_subset.py index 4b755214af..3ee4ec197c 100644 --- a/flowmachine/flowmachine/features/utilities/event_table_subset.py +++ b/flowmachine/flowmachine/features/utilities/event_table_subset.py @@ -3,18 +3,16 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-import datetime -import pandas as pd import warnings from sqlalchemy import select from typing import List, Optional, Tuple -from ...core import Query, Table -from ...core.context import get_db -from ...core.errors import MissingDateError -from ...core.events_table import events_table_map -from ...core.preflight import pre_flight -from ...core.sqlalchemy_utils import ( +from flowmachine.core.query import Query +from flowmachine.core.context import get_db +from flowmachine.core.errors import MissingDateError +from flowmachine.core.events_table import events_table_map +from flowmachine.core.preflight import pre_flight +from flowmachine.core.sqlalchemy_utils import ( get_sqlalchemy_table_definition, make_sqlalchemy_column_from_flowmachine_column_description, get_sql_string, @@ -34,14 +32,10 @@ class EventTableSubset(Query): Parameters ---------- - start : str, default None - iso format date range for the beginning of the time frame, e.g. - 2016-01-01 or 2016-01-01 14:03:01. If None, it will use the - earliest date seen in the `events.calls` table. - stop : str, default None - As above. If None, it will use the latest date seen in the - `events.calls` table. - hours : tuple of ints, default None + start, stop : str + iso format date range for the beginning and end of the time frame, e.g. + 2016-01-01 or 2016-01-01 14:03:01. + hours : tuple of ints, default 'None' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but across all specified days. Or set to 'all' to include @@ -74,8 +68,8 @@ class EventTableSubset(Query): def __init__( self, *, - start=None, - stop=None, + start, + stop, hours: Optional[Tuple[int, int]] = None, hour_slices=None, table="calls", @@ -160,34 +154,15 @@ def check_dates(self): # If there are no dates present, then we raise an error # if some are present, but some are missing we raise a # warning. - # If the subscriber does not pass a start or stop date, then we take - # the min/max date in the events.calls table - if self.start is None: - d1 = ( - get_db() - .min_date(self.table_ORIG.fully_qualified_table_name.split(".")[1]) - .strftime("%Y-%m-%d %H:%M:%S") - ) - else: - d1 = self.start.split()[0] - - if self.stop is None: - d2 = ( - get_db() - .max_date(self.table_ORIG.fully_qualified_table_name.split(".")[1]) - .strftime("%Y-%m-%d %H:%M:%S") - ) - else: - d2 = self.stop.split()[0] + d1 = self.start.split()[0] + d2 = self.stop.split()[0] all_dates = list_of_dates(d1, d2) # Slightly annoying feature, but if the subscriber passes a date such as '2016-01-02' # this will be interpreted as midnight, so we don't want to include this in our # calculations. 
Check for this here, an if this is the case pop the final element # of the list - if (self.stop is not None) and ( - len(self.stop) == 10 or self.stop.endswith("00:00:00") - ): + if len(self.stop) == 10 or self.stop.endswith("00:00:00"): all_dates.pop(-1) # This will be a true false list for whether each of the dates # is present in the database @@ -213,7 +188,7 @@ def check_dates(self): stacklevel=2, ) - def _make_query_with_sqlalchemy(self): + def _make_query(self): sqlalchemy_table = get_sqlalchemy_table_definition( self.table_ORIG.fully_qualified_table_name, engine=get_db().engine, @@ -240,8 +215,6 @@ def _make_query_with_sqlalchemy(self): return get_sql_string(select_stmt) - _make_query = _make_query_with_sqlalchemy - @property def fully_qualified_table_name(self): # EventTableSubset are a simple select from events, and should not be cached diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index 2d63805e91..e5d5b752ec 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -1,10 +1,11 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +import datetime import structlog import warnings -from typing import List, Optional, Tuple +from typing import List, Union, Optional from ...core import Query from ...core.context import get_db @@ -43,8 +44,8 @@ class EventsTablesUnion(Query): def __init__( self, - start, - stop, + start: Union[str, datetime.date, datetime.datetime], + stop: Union[str, datetime.date, datetime.datetime], *, columns, tables=None, From 5a756921afd2f8ad93aee8e966bc0fbc2eefddd6 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 2 Oct 2020 16:57:18 +0100 Subject: [PATCH 16/89] Small fixups --- flowmachine/flowmachine/core/cache.py | 9 ++++---- .../core/errors/flowmachine_errors.py | 21 +++++++++++++++++ flowmachine/flowmachine/core/preflight.py | 23 ++++++++++++++----- flowmachine/flowmachine/core/query.py | 5 ++-- .../core/server/action_handlers.py | 14 +++++++++++ flowmachine/flowmachine/core/table.py | 1 - .../test_sql_strings_and_results.py | 4 ++-- flowmachine/tests/test_cache.py | 4 +++- flowmachine/tests/test_cache_utils.py | 6 ++++- flowmachine/tests/test_daily_location.py | 5 +++- flowmachine/tests/test_event_table_subset.py | 9 +++----- flowmachine/tests/test_query_union.py | 4 ++-- flowmachine/tests/test_raster_statistics.py | 2 +- .../tests/test_subscriber_subsetting.py | 6 +++-- flowmachine/tests/test_table.py | 15 +++++------- flowmachine/tests/test_to_sql.py | 4 ++-- 16 files changed, 92 insertions(+), 40 deletions(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index 15a23052bb..ef12032eff 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -189,19 +189,20 @@ def write_query_to_cache( if this_thread_is_owner: logger.debug(f"In charge of executing '{query.query_id}'.") try: + query.preflight() query_ddl_ops = ddl_ops_func(name, schema) except Exception as exc: + q_state_machine.raise_error() logger.error(f"Error generating SQL. 
Error was {exc}") raise exc logger.debug("Made SQL.") con = connection.engine with con.begin() as trans: try: - plan_time = run_ops_list_and_return_execution_time( - query_ddl_ops, trans - ) + plan_time = run_ops_list_and_return_execution_time(query_ddl_ops, trans) logger.debug("Executed queries.") except Exception as exc: + q_state_machine.raise_error() logger.error(f"Error executing SQL. Error was {exc}") raise exc if schema == "cache": @@ -213,6 +214,7 @@ def write_query_to_cache( executed_sql=";\n".join(query_ddl_ops), ) except Exception as exc: + q_state_machine.raise_error() logger.error(f"Error writing cache metadata. Error was {exc}") raise exc q_state_machine.finish() @@ -223,7 +225,6 @@ def write_query_to_cache( finally: if this_thread_is_owner and not q_state_machine.is_finished_executing: q_state_machine.cancel() - q_state_machine.wait_until_complete(sleep_duration=sleep_duration) if q_state_machine.is_completed: return query diff --git a/flowmachine/flowmachine/core/errors/flowmachine_errors.py b/flowmachine/flowmachine/core/errors/flowmachine_errors.py index 60ce8fccb9..ef23a34dbb 100644 --- a/flowmachine/flowmachine/core/errors/flowmachine_errors.py +++ b/flowmachine/flowmachine/core/errors/flowmachine_errors.py @@ -6,6 +6,27 @@ """ Custom errors raised by flowmachine. """ +from typing import List, Dict + + +class PreFlightFailedException(Exception): + """ + Exception indicating that preflight checks for a query failed. + + Parameters + ---------- + query_id : str + Identifier of the query + failures : dict + Mapping from query reps to lists of exceptions raised in preflight + """ + + def __init__(self, query_id: str, errors: Dict[str, List[Exception]]): + self.errors = errors + self.query_id = query_id + Exception.__init__( + self, f"Pre-flight failed for '{self.query_id}'. Errors: {errors}" + ) class StoreFailedException(Exception): diff --git a/flowmachine/flowmachine/core/preflight.py b/flowmachine/flowmachine/core/preflight.py index c994133222..18cfad3511 100644 --- a/flowmachine/flowmachine/core/preflight.py +++ b/flowmachine/flowmachine/core/preflight.py @@ -12,7 +12,10 @@ get_dependency_links, _assemble_dependency_graph, ) -from flowmachine.core.errors.flowmachine_errors import QueryErroredException +from flowmachine.core.errors.flowmachine_errors import ( + QueryErroredException, + PreFlightFailedException, +) logger = structlog.get_logger("flowmachine.debug", submodule=__name__) @@ -64,6 +67,7 @@ def resolve_hooks(cls) -> typing.Dict[str, typing.List[typing.Callable]]: class Preflight: def preflight(self): + logger.debug("Starting pre-flight checks.", query=str(self)) errors = dict() dep_graph = _assemble_dependency_graph( dependencies=get_dependency_links(self), @@ -75,14 +79,21 @@ def preflight(self): for dependency in deps: for hook in resolve_hooks(dependency.__class__)["pre_flight"]: + logger.debug( + "Running hook", + query=str(self), + hook=hook, + dependency=str(dependency), + ) try: getattr(dependency, hook)() except Exception as e: - errors.setdefault(dependency.query_id, []).append(e) + errors.setdefault(str(dependency), []).append(e) if len(errors) > 0: logger.debug( - "Pre-flight failed.", query=self, query_id=self.query_id, errors=errors - ) - raise QueryErroredException( - f"Pre-flight failed for '{self.query_id}'. 
Errors: {errors}" + "Pre-flight failed.", + query=str(self), + query_id=self.query_id, + errors=errors, ) + raise PreFlightFailedException(self.query_id, errors) diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index f25fafb412..d30609f6f9 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -263,7 +263,6 @@ def get_query(self): SQL query string. """ - self.preflight() try: table_name = self.fully_qualified_table_name schema, name = table_name.split(".") @@ -389,9 +388,11 @@ def get_table(self): The stored version of this Query as a Table object """ if self.is_stored: - return flowmachine.core.Table( + table = flowmachine.core.Table( self.fully_qualified_table_name, columns=self.column_names ) + table.preflight() + return table else: raise ValueError(f"{self} not stored on this connection.") diff --git a/flowmachine/flowmachine/core/server/action_handlers.py b/flowmachine/flowmachine/core/server/action_handlers.py index 8f53e5d809..ce1c516cc8 100644 --- a/flowmachine/flowmachine/core/server/action_handlers.py +++ b/flowmachine/flowmachine/core/server/action_handlers.py @@ -41,6 +41,7 @@ from ..connection import MissingCheckError from ..dependency_graph import query_progress +from ..errors.flowmachine_errors import PreFlightFailedException async def action_handler__ping(config: "FlowmachineServerConfig") -> ZMQReply: @@ -80,6 +81,19 @@ async def action_handler__get_query_schemas( def _load_query_object(params: dict) -> "BaseExposedQuery": try: query_obj = FlowmachineQuerySchema().load(params) + query_obj = FlowmachineQuerySchema().load(action_params) + query_obj.preflight() # Note that we probably want to remove this call to allow getting qid faster + except PreFlightFailedException as exc: + orig_error_msg = exc.args[0] + error_msg = ( + f"Internal flowmachine server error: could not create query object using query schema. " + f"The original error was: '{orig_error_msg}'" + ) + raise QueryLoadError( + error_msg, + params, + orig_error_msg=orig_error_msg, + ) except TypeError as exc: # We need to catch TypeError here, otherwise they propagate up to # perform_action() and result in a very misleading error message. 
diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 4a80584c04..717071893b 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -145,7 +145,6 @@ def _make_query(self): return "SELECT {cols} FROM {fqn}".format(fqn=self.fqn, cols=cols) def get_query(self): - self.preflight() with get_db().engine.begin() as trans: trans.exec_driver_sql( f"UPDATE cache.cached SET last_accessed = NOW(), access_count = access_count + 1 WHERE query_id ='{self.query_id}'" diff --git a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py index c69abb2276..bee087afb2 100644 --- a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py +++ b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py @@ -124,7 +124,7 @@ def test_daily_location_4_sql(diff_reporter): ) dl = daily_location( "2016-01-05", - table="events.calls", + table="calls", hours=(22, 6), subscriber_subset=subset_query, ) @@ -142,7 +142,7 @@ def test_daily_location_4_df(get_dataframe, diff_reporter): ) dl = daily_location( "2016-01-05", - table="events.calls", + table="calls", hours=(22, 6), subscriber_subset=subset_query, ) diff --git a/flowmachine/tests/test_cache.py b/flowmachine/tests/test_cache.py index 5e09d35236..9222daf930 100644 --- a/flowmachine/tests/test_cache.py +++ b/flowmachine/tests/test_cache.py @@ -27,7 +27,6 @@ def test_table_records_removed(flowmachine_connect): dl.store().result() assert dl.is_stored table = dl.get_table() - table.preflight() assert cache_table_exists(get_db(), table.query_id) dl.invalidate_db_cache() @@ -40,6 +39,7 @@ def test_do_cache_simple(flowmachine_connect): """ dl1 = daily_location("2016-01-01") + dl1.preflight() with get_db().engine.begin() as trans: write_cache_metadata(trans, dl1) assert cache_table_exists(get_db(), dl1.query_id) @@ -52,6 +52,7 @@ def test_do_cache_multi(flowmachine_connect): """ hl1 = ModalLocation(daily_location("2016-01-01"), daily_location("2016-01-02")) + hl1.preflight() with get_db().engine.begin() as trans: write_cache_metadata(trans, hl1) @@ -66,6 +67,7 @@ def test_do_cache_nested(flowmachine_connect): hl1 = ModalLocation(daily_location("2016-01-01"), daily_location("2016-01-02")) hl2 = ModalLocation(daily_location("2016-01-03"), daily_location("2016-01-04")) flow = Flows(hl1, hl2) + flow.preflight() with get_db().engine.begin() as trans: write_cache_metadata(trans, flow) diff --git a/flowmachine/tests/test_cache_utils.py b/flowmachine/tests/test_cache_utils.py index 7249b0db2b..24773894ce 100644 --- a/flowmachine/tests/test_cache_utils.py +++ b/flowmachine/tests/test_cache_utils.py @@ -77,6 +77,7 @@ def test_touch_cache_record_for_query(flowmachine_connect): """ Touching a cache record for a query should update access count, last accessed, & counter. 
""" + initial_touches = get_db().fetch("SELECT nextval('cache.cache_touches');")[0][0] table = daily_location("2016-01-01").store().result() assert ( @@ -96,7 +97,10 @@ def test_touch_cache_record_for_query(flowmachine_connect): )[0][0] ) # Two cache touches should have been recorded - assert 4 == get_db().fetch("SELECT nextval('cache.cache_touches');")[0][0] + assert ( + initial_touches + 2 + == get_db().fetch("SELECT nextval('cache.cache_touches');")[0][0] + ) assert ( accessed_at < get_db().fetch( diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index be206862d4..f8bdcd40af 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -6,6 +6,7 @@ from flowmachine.core.errors import MissingDateError from flowmachine.core import make_spatial_unit +from flowmachine.core.errors.flowmachine_errors import PreFlightFailedException from flowmachine.features import daily_location, MostFrequentLocation @@ -77,5 +78,7 @@ def test_daily_locs_errors(): daily_location() errors when we ask for a date that does not exist. """ - with pytest.raises(MissingDateError): + with pytest.raises(PreFlightFailedException) as exc: daily_location("2016-01-31").preflight() + with pytest.raises(MissingDateError): + raise exc.value.errors[str(daily_location("2016-01-31"))][0] diff --git a/flowmachine/tests/test_event_table_subset.py b/flowmachine/tests/test_event_table_subset.py index 4e1af4b09f..a88f2910c0 100644 --- a/flowmachine/tests/test_event_table_subset.py +++ b/flowmachine/tests/test_event_table_subset.py @@ -14,6 +14,7 @@ import flowmachine.core from flowmachine.core.errors import MissingDateError +from flowmachine.core.errors.flowmachine_errors import PreFlightFailedException from flowmachine.features.utilities.event_table_subset import EventTableSubset @@ -47,7 +48,7 @@ def test_warns_on_missing(): """ message = "115 of 122 calendar dates missing. Earliest date is 2016-01-01, latest is 2016-01-07" with pytest.warns(UserWarning, match=message): - EventTableSubset(start="2016-01-01", stop="2016-05-02") + EventTableSubset(start="2016-01-01", stop="2016-05-02").preflight() @pytest.mark.check_available_dates @@ -55,12 +56,8 @@ def test_error_on_all_missing(): """ Date subsetter should error when all dates are missing. """ - with pytest.raises(MissingDateError): + with pytest.raises(PreFlightFailedException) as exc: EventTableSubset(start="2016-05-01", stop="2016-05-02").preflight() - with pytest.raises(MissingDateError): - EventTableSubset( - start="2016-05-01", stop="2016-05-02", table="topups" - ).preflight() def test_handles_mins(get_dataframe): diff --git a/flowmachine/tests/test_query_union.py b/flowmachine/tests/test_query_union.py index 66a6928e14..0e483f74e1 100644 --- a/flowmachine/tests/test_query_union.py +++ b/flowmachine/tests/test_query_union.py @@ -11,7 +11,7 @@ def test_union_column_names(): """Test that Union's column_names property is accurate""" union = Table("events.calls_20160101", columns=["id"]).union( - Table("events.calls_20160102"), columns=["id"] + Table("events.calls_20160102", columns=["id"]) ) assert union.head(0).columns.tolist() == union.column_names @@ -31,7 +31,7 @@ def test_union(get_dataframe): """ Test union with all set to false dedupes. 
""" - q1 = Table(schema="events", name="calls", columns=["id"]) + q1 = Table(schema="events", name="calls", columns=["id", "msisdn"]) union = q1.union(q1, all=False) union_df = get_dataframe(union) single_id = union_df[union_df.id == "5wNJA-PdRJ4-jxEdG-yOXpZ"] diff --git a/flowmachine/tests/test_raster_statistics.py b/flowmachine/tests/test_raster_statistics.py index d09ef605c6..058583633d 100644 --- a/flowmachine/tests/test_raster_statistics.py +++ b/flowmachine/tests/test_raster_statistics.py @@ -17,7 +17,7 @@ def test_computes_expected_clipping_values(get_dataframe): RasterStatistics() returns correct values when clipping vector and raster layers. """ G = "admin2pcod" - vector = Table(schema="geography", name="admin2", columns=["admin3pcod", "geom"]) + vector = Table(schema="geography", name="admin2", columns=["admin2pcod", "geom"]) r = RasterStatistics( raster="population.small_nepal_raster", vector=vector, grouping_element=G ) diff --git a/flowmachine/tests/test_subscriber_subsetting.py b/flowmachine/tests/test_subscriber_subsetting.py index ebb282ccb5..543c764f6e 100644 --- a/flowmachine/tests/test_subscriber_subsetting.py +++ b/flowmachine/tests/test_subscriber_subsetting.py @@ -74,8 +74,10 @@ def subscriber_list_table(subscriber_list, flowmachine_connect): ) trans.exec_driver_sql(sql) subs_table = Table("subscriber_list", columns=["subscriber"]) - yield subs_table - subs_table.invalidate_db_cache(drop=True) + try: + yield subs_table + finally: + subs_table.invalidate_db_cache(drop=True) def test_cdrs_can_be_subset_by_table( diff --git a/flowmachine/tests/test_table.py b/flowmachine/tests/test_table.py index f2d6fd31da..d905e0eb19 100644 --- a/flowmachine/tests/test_table.py +++ b/flowmachine/tests/test_table.py @@ -3,7 +3,10 @@ import pytest from flowmachine.core import Table -from flowmachine.core.errors.flowmachine_errors import QueryErroredException +from flowmachine.core.errors.flowmachine_errors import ( + QueryErroredException, + PreFlightFailedException, +) @pytest.mark.parametrize("columns", [["msisdn", "id"]]) @@ -38,7 +41,7 @@ def test_table_init(args): ], ) def test_table_preflight(args): - with pytest.raises(QueryErroredException): + with pytest.raises(PreFlightFailedException): Table(**args).preflight() @@ -87,17 +90,11 @@ def test_get_table_is_self(): def test_dependencies(): """ - Check that a table without explicit columns has no other queries as a dependency, - and a table with explicit columns has its parent table as a dependency. + Check that a table has no other queries as a dependency. 
""" t1 = Table("events.calls", columns=["id"]) assert t1.dependencies == set() - t2 = Table("events.calls", columns=["id"]) - assert len(t2.dependencies) == 1 - t2_parent = t2.dependencies.pop() - assert "057addedac04dbeb1dcbbb6b524b43f0" == t2_parent.query_id - def test_subset(): """ diff --git a/flowmachine/tests/test_to_sql.py b/flowmachine/tests/test_to_sql.py index 5599789232..fea0fae1fb 100644 --- a/flowmachine/tests/test_to_sql.py +++ b/flowmachine/tests/test_to_sql.py @@ -46,7 +46,7 @@ def test_can_force_rewrite(flowmachine_connect, get_length): sql = """DELETE FROM tests.test_rewrite""" with get_db().engine.begin() as conn: conn.exec_driver_sql(sql) - assert 0 == get_length(Table("tests.test_rewrite")) + assert 0 == get_length(Table("tests.test_rewrite", columns=query.column_names)) query.invalidate_db_cache(name="test_rewrite", schema="tests") query.to_sql(name="test_rewrite", schema="tests").result() - assert 1 < get_length(Table("tests.test_rewrite")) + assert 1 < get_length(Table("tests.test_rewrite", columns=query.column_names)) From 9e21ef2e51eb4ccc80ad68342cf1a384dd47eb5d Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 5 Oct 2020 12:03:59 +0100 Subject: [PATCH 17/89] Fix error return on preflight fail --- .../flowmachine/core/server/action_handlers.py | 13 ++++++++++++- .../features/network/total_network_objects.py | 6 +++--- flowmachine/tests/server/test_action_handlers.py | 6 ++---- flowmachine/tests/test_total_network_objects.py | 2 ++ 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/flowmachine/flowmachine/core/server/action_handlers.py b/flowmachine/flowmachine/core/server/action_handlers.py index ce1c516cc8..2ba79d1ecb 100644 --- a/flowmachine/flowmachine/core/server/action_handlers.py +++ b/flowmachine/flowmachine/core/server/action_handlers.py @@ -81,7 +81,6 @@ async def action_handler__get_query_schemas( def _load_query_object(params: dict) -> "BaseExposedQuery": try: query_obj = FlowmachineQuerySchema().load(params) - query_obj = FlowmachineQuerySchema().load(action_params) query_obj.preflight() # Note that we probably want to remove this call to allow getting qid faster except PreFlightFailedException as exc: orig_error_msg = exc.args[0] @@ -212,6 +211,18 @@ async def action_handler__run_query( ), ), ) + except PreFlightFailedException as exc: + orig_error_msg = exc.args[0] + error_msg = f"Preflight failed for {exc.query_id}." 
+ return ZMQReply( + status="error", + msg=error_msg, + payload={ + "params": action_params, + "orig_error_msg": orig_error_msg, + "errors": exc.errors, + }, + ) except Exception as e: return ZMQReply( status="error", diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index a4aba4c38c..5028006a05 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -10,7 +10,7 @@ """ - +import datetime from typing import List, Optional, Tuple, Union from ...core.context import get_db @@ -66,8 +66,8 @@ class TotalNetworkObjects(GeoDataMixin, Query): def __init__( self, - start=None, - stop=None, + start: Union[str, datetime.date, datetime.datetime], + stop: Union[str, datetime.date, datetime.datetime], *, table="all", total_by="day", diff --git a/flowmachine/tests/server/test_action_handlers.py b/flowmachine/tests/server/test_action_handlers.py index ae7531b804..2297102960 100644 --- a/flowmachine/tests/server/test_action_handlers.py +++ b/flowmachine/tests/server/test_action_handlers.py @@ -187,10 +187,8 @@ async def test_run_query_error_handled(dummy_redis, server_config): ), ) assert msg.status == ZMQReplyStatus.ERROR - assert ( - msg.msg - == "Internal flowmachine server error: could not create query object using query schema. The original error was: 'type object argument after * must be an iterable, not Mock'" - ) + assert msg.msg.rstrip() == "Preflight failed for 992b9767d40a3fe477f3cf642d94f36c." + assert len(msg.errors) == 3 @pytest.mark.parametrize("bad_unit", ["NOT_A_VALID_UNIT", "admin4"]) diff --git a/flowmachine/tests/test_total_network_objects.py b/flowmachine/tests/test_total_network_objects.py index 8f84d29c94..9931e837a0 100644 --- a/flowmachine/tests/test_total_network_objects.py +++ b/flowmachine/tests/test_total_network_objects.py @@ -135,6 +135,8 @@ def test_median_returns_correct_values(get_dataframe): """ instance = AggregateNetworkObjects( total_network_objects=TotalNetworkObjects( + start="2016-01-01", + stop="2016-01-08", table="calls", total_by="hour", network_object=make_spatial_unit("versioned-site"), From 26c964dfe6fcd68d357df3e7330b13d815c7be49 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 5 Oct 2020 13:48:07 +0100 Subject: [PATCH 18/89] Wrap geom table arg in a table --- .../flowmachine/core/server/query_schemas/base_schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/core/server/query_schemas/base_schema.py b/flowmachine/flowmachine/core/server/query_schemas/base_schema.py index ca21d6cdba..71a745aa9c 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/base_schema.py +++ b/flowmachine/flowmachine/core/server/query_schemas/base_schema.py @@ -4,7 +4,7 @@ from marshmallow import Schema, post_load -from flowmachine.core import make_spatial_unit +from flowmachine.core import make_spatial_unit, Table class BaseSchema(Schema): @@ -22,7 +22,7 @@ def remove_query_kind_if_present_and_load(self, params, **kwargs): elif "lon-lat" in aggregation_unit_string: spatial_unit_args = { "spatial_unit_type": "lon-lat", - "geom_table": geom_table, + "geom_table": Table(geom_table), "geom_table_join_on": geom_table_join_on, } else: From 6e556ac00d872857ae0057fa032e7eb37fad2954 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 5 Oct 2020 13:48:16 +0100 Subject: [PATCH 19/89] Update tests --- 
...lts.test_daily_location_4_sql.approved.txt | 19 +------------------ ...lts.test_daily_location_6_sql.approved.txt | 19 +------------------ .../tests/server/test_action_handlers.py | 2 +- flowmachine/tests/test_daily_location.py | 4 +++- flowmachine/tests/test_dependency_graph.py | 10 ++++------ flowmachine/tests/test_query.py | 4 ++-- ...truction.test_construct_query.approved.txt | 16 ++++++++-------- 7 files changed, 20 insertions(+), 54 deletions(-) diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt index db72f725d8..625e1035cd 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt @@ -30,25 +30,8 @@ FROM (SELECT subscriber_locs.subscriber, loc_table.date_of_last_service, geom_table.admin3pcod AS pcod FROM infrastructure.cells AS loc_table - INNER JOIN (SELECT gid, - admin0name, - admin0pcod, - admin1name, - admin1pcod, - admin2name, - admin2pcod, + INNER JOIN (SELECT admin3pcod, admin3name, - admin3pcod, - admin3refn, - admin3altn, - admin3al_1, - date, - validon, - validto, - shape_star, - shape_stle, - shape_leng, - shape_area, geom FROM geography.admin3) AS geom_table ON st_within(CAST(loc_table.geom_point AS geometry), CAST(st_setsrid(geom_table.geom, 4326) AS geometry))) AS sites ON l.location_id = sites.location_id diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt index e9516ca9a0..16806c0fe7 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt @@ -33,25 +33,8 @@ FROM (SELECT subscriber_locs.subscriber, loc_table.date_of_last_service, geom_table.admin3pcod AS pcod FROM infrastructure.cells AS loc_table - INNER JOIN (SELECT gid, - admin0name, - admin0pcod, - admin1name, - admin1pcod, - admin2name, - admin2pcod, + INNER JOIN (SELECT admin3pcod, admin3name, - admin3pcod, - admin3refn, - admin3altn, - admin3al_1, - date, - validon, - validto, - shape_star, - shape_stle, - shape_leng, - shape_area, geom FROM geography.admin3) AS geom_table ON st_within(CAST(loc_table.geom_point AS geometry), CAST(st_setsrid(geom_table.geom, 4326) AS geometry))) AS sites ON l.location_id = sites.location_id diff --git a/flowmachine/tests/server/test_action_handlers.py b/flowmachine/tests/server/test_action_handlers.py index 2297102960..f5b9582d21 100644 --- a/flowmachine/tests/server/test_action_handlers.py +++ b/flowmachine/tests/server/test_action_handlers.py @@ -188,7 +188,7 @@ async def test_run_query_error_handled(dummy_redis, server_config): ) assert msg.status == ZMQReplyStatus.ERROR assert msg.msg.rstrip() == "Preflight failed for 992b9767d40a3fe477f3cf642d94f36c." 
- assert len(msg.errors) == 3 + assert len(msg.payload["errors"]) == 3 @pytest.mark.parametrize("bad_unit", ["NOT_A_VALID_UNIT", "admin4"]) diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index f8bdcd40af..f2343f8825 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -81,4 +81,6 @@ def test_daily_locs_errors(): with pytest.raises(PreFlightFailedException) as exc: daily_location("2016-01-31").preflight() with pytest.raises(MissingDateError): - raise exc.value.errors[str(daily_location("2016-01-31"))][0] + raise exc.value.errors[ + "" + ][0] diff --git a/flowmachine/tests/test_dependency_graph.py b/flowmachine/tests/test_dependency_graph.py index 5b665c79c7..767fb0ded9 100644 --- a/flowmachine/tests/test_dependency_graph.py +++ b/flowmachine/tests/test_dependency_graph.py @@ -49,6 +49,8 @@ def test_print_dependency_tree(): expected_output = textwrap.dedent( """\ + - + - - - - @@ -56,16 +58,12 @@ def test_print_dependency_tree(): - - - - - - - + - - - - - - - + - - - - - - - """ ) diff --git a/flowmachine/tests/test_query.py b/flowmachine/tests/test_query.py index 523b824262..8b7d254b9f 100644 --- a/flowmachine/tests/test_query.py +++ b/flowmachine/tests/test_query.py @@ -81,11 +81,11 @@ def test_is_stored(): class storable_query(Query): def _make_query(self): - return """SELECT 1""" + return """SELECT 1 as col""" @property def column_names(self) -> List[str]: - return ["1"] + return ["col"] sq = storable_query() sq.invalidate_db_cache() diff --git a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt index 5dcfc84cac..0b4dd8694d 100644 --- a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt +++ b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt @@ -1,5 +1,5 @@ { - "261b85fc9c64253889174ea8151f1d69": { + "27d2fc28055be78304bc184f6be2aa21": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -20,7 +20,7 @@ } } }, - "e142ffb3174add433422c2724a08c02b": { + "992b9767d40a3fe477f3cf642d94f36c": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -42,7 +42,7 @@ "event_types": null, "subscriber_subset": null }, - "ea5d819b0128448732785dea44bcd866": { + "7658568d3838c1d08b7668f3d64bb867": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "modal_location", @@ -64,11 +64,11 @@ ] } }, - "6521353e7563ed700dfd2cf90721934b": { + "ecff278a078630433a0f54f45ab0bcdc": { "query_kind": "geography", "aggregation_unit": "admin3" }, - "75b0532a747b473f69b45b78fdc29865": { + "37390b99b93c6384b85de61d8386ace3": { "query_kind": "meaningful_locations_aggregate", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -161,7 +161,7 @@ "tower_cluster_call_threshold": 0, "subscriber_subset": null }, - "5df56eee4dc96ed961f8eb76583cc6de": { + "fb815e3de0e965554bb17b6940d4c4e2": { "query_kind": "meaningful_locations_between_label_od_matrix", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -256,7 +256,7 @@ "event_types": null, "subscriber_subset": null }, - "00bee92b22b1ceb98244c5700a079656": { + "4c5cb79dd44c3aba255f7048a1054d9b": { "query_kind": "meaningful_locations_between_dates_od_matrix", "aggregation_unit": "admin1", "start_date_a": "2016-01-01", @@ -934,4 +934,4 @@ }, "join_type": "full outer" } -} +} \ No newline at end of file From 
93e54e16ffd9d6e15842c31886005a914d913270 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 13:07:54 +0100 Subject: [PATCH 20/89] Update table names in test --- .../flowmachine_tests/test_events_tables_union.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/integration_tests/tests/flowmachine_tests/test_events_tables_union.py b/integration_tests/tests/flowmachine_tests/test_events_tables_union.py index 79c02e628c..7137e2a6c6 100644 --- a/integration_tests/tests/flowmachine_tests/test_events_tables_union.py +++ b/integration_tests/tests/flowmachine_tests/test_events_tables_union.py @@ -15,7 +15,7 @@ def test_events_tables_union_1_sql(diff_reporter): etu = EventsTablesUnion( start="2016-01-02", stop="2016-01-03", - tables=["events.calls"], + tables=["calls"], columns=[ "datetime", "duration", @@ -38,7 +38,7 @@ def test_events_tables_union_1_df(diff_reporter, get_dataframe): etu = EventsTablesUnion( start="2016-01-02", stop="2016-01-03", - tables=["events.calls"], + tables=["calls"], columns=[ "datetime", "duration", @@ -62,7 +62,7 @@ def test_events_tables_union_2_sql(diff_reporter): start="2016-01-03", stop="2016-01-05", hours=(7, 13), - tables=["events.calls"], + tables=["calls"], columns=[ "datetime", "duration", @@ -86,7 +86,7 @@ def test_events_tables_union_2_df(diff_reporter, get_dataframe): start="2016-01-03", stop="2016-01-05", hours=(7, 13), - tables=["events.calls"], + tables=["calls"], columns=[ "datetime", "duration", @@ -110,7 +110,7 @@ def test_events_tables_union_3_sql(diff_reporter): start="2016-01-02", stop="2016-01-04", hours=(21, 5), - tables=["events.calls"], + tables=["calls"], columns=[ "datetime", "duration", @@ -134,7 +134,7 @@ def test_events_tables_union_3_df(diff_reporter, get_dataframe): start="2016-01-02", stop="2016-01-04", hours=(21, 5), - tables=["events.calls"], + tables=["calls"], columns=[ "datetime", "duration", From 3a9b2ae0902ab2967f42d810f0203898d097de82 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 13:12:21 +0100 Subject: [PATCH 21/89] Match original table order --- .../flowmachine/features/utilities/events_tables_union.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index e5d5b752ec..db684d3395 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -85,8 +85,8 @@ def column_names(self) -> List[str]: def _parse_tables(self, tables): if tables is None: return ( - "sms", "calls", + "sms", ) # This should default to all the tables really, but that would break all the tests elif isinstance(tables, str): return [tables] From 85be37c7c5cab98daccb8f7dba30d95624578b66 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 13:13:49 +0100 Subject: [PATCH 22/89] Update passed table in docs --- .../advanced_usage/worked_examples/mobile-data-usage.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/analyst/advanced_usage/worked_examples/mobile-data-usage.ipynb b/docs/source/analyst/advanced_usage/worked_examples/mobile-data-usage.ipynb index 6f07c89a48..5a22a2baaf 100644 --- a/docs/source/analyst/advanced_usage/worked_examples/mobile-data-usage.ipynb +++ b/docs/source/analyst/advanced_usage/worked_examples/mobile-data-usage.ipynb @@ -92,7 +92,7 @@ "data_events_query = 
flowmachine.features.TotalLocationEvents(\n", " start=\"2016-01-01\",\n", " stop=\"2016-01-08\",\n", - " table=\"events.mds\",\n", + " table=\"mds\",\n", " spatial_unit=make_spatial_unit(\"versioned-cell\"),\n", " interval=\"hour\",\n", ")" From d33ec653c58f0db68e2f77bbb86d6c756510f76b Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 14:12:12 +0100 Subject: [PATCH 23/89] Update table order in dependency graph --- flowmachine/tests/test_dependency_graph.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flowmachine/tests/test_dependency_graph.py b/flowmachine/tests/test_dependency_graph.py index 767fb0ded9..028064edda 100644 --- a/flowmachine/tests/test_dependency_graph.py +++ b/flowmachine/tests/test_dependency_graph.py @@ -55,13 +55,13 @@ def test_print_dependency_tree(): - - - - - - - - - + - + - - - - - + - - - """ From 0045814f1747c65879a1432b439c517de1a9c367 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 14:16:45 +0100 Subject: [PATCH 24/89] Sort provided tables --- .../flowmachine/features/utilities/events_tables_union.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index db684d3395..ac33af5495 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -99,7 +99,7 @@ def _parse_tables(self, tables): elif len(tables) == 0: raise ValueError("Empty tables list.") else: - return tables + return sorted(tables) def _make_table_list(self, *, hours, subscriber_subset, subscriber_identifier): """ From b47d34495fa3d400b7c819c658357df1f7623bb3 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 14:17:01 +0100 Subject: [PATCH 25/89] Update query hashed in approval test --- ..._construction.test_construct_query.approved.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt index 0b4dd8694d..a210b4bf68 100644 --- a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt +++ b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt @@ -1,5 +1,5 @@ { - "27d2fc28055be78304bc184f6be2aa21": { + "a91de70f2b73901616b6025c04600ae1": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -20,7 +20,7 @@ } } }, - "992b9767d40a3fe477f3cf642d94f36c": { + "d2e36b1c951a96b3a03c31c8531e8bbe": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -32,7 +32,7 @@ "sampling": null } }, - "a2a6a08ad6d6119fffc4cce762c08109": { + "bf6c9cc30d8ac4728c03c7b903766973": { "query_kind": "location_event_counts", "start_date": "2016-01-01", "end_date": "2016-01-02", @@ -42,7 +42,7 @@ "event_types": null, "subscriber_subset": null }, - "7658568d3838c1d08b7668f3d64bb867": { + "5643b75a52a7b0380a9341c318f1043f": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "modal_location", @@ -68,7 +68,7 @@ "query_kind": "geography", "aggregation_unit": "admin3" }, - "37390b99b93c6384b85de61d8386ace3": { + "1f4272f63a286decddf0977cd35a7438": { "query_kind": "meaningful_locations_aggregate", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -161,7 +161,7 @@ 
"tower_cluster_call_threshold": 0, "subscriber_subset": null }, - "fb815e3de0e965554bb17b6940d4c4e2": { + "1b1889cff10cdd99fcb1468c92e99a20": { "query_kind": "meaningful_locations_between_label_od_matrix", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -256,7 +256,7 @@ "event_types": null, "subscriber_subset": null }, - "4c5cb79dd44c3aba255f7048a1054d9b": { + "2a90c6389bb46578b9eb863d132512cf": { "query_kind": "meaningful_locations_between_dates_od_matrix", "aggregation_unit": "admin1", "start_date_a": "2016-01-01", From a86895c2a0cc86d7e641da7d98e1d3ee6286e8f0 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 15:39:35 +0100 Subject: [PATCH 26/89] Fix fast forwarding state machine multiple times, add missing classes to sql-side touch cache --- flowdb/bin/build/0030_utilities.sql | 2 +- flowmachine/flowmachine/core/preflight.py | 2 +- flowmachine/flowmachine/core/table.py | 16 +++++++++++----- flowmachine/tests/test_cache_utils.py | 2 +- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/flowdb/bin/build/0030_utilities.sql b/flowdb/bin/build/0030_utilities.sql index a89fe4d023..56a4f49fe9 100644 --- a/flowdb/bin/build/0030_utilities.sql +++ b/flowdb/bin/build/0030_utilities.sql @@ -243,7 +243,7 @@ $$ DECLARE score float; BEGIN UPDATE cache.cached SET last_accessed = NOW(), access_count = access_count + 1, - cache_score_multiplier = CASE WHEN class='Table' THEN 0 ELSE + cache_score_multiplier = CASE WHEN class=ANY(ARRAY['Table', 'GeoTable', 'CallsTable', 'SmsTable', 'MdsTable', 'TopupsTable', 'ForwardsTable', 'TacsTable', 'CellsTable', 'SitesTable']) THEN 0 ELSE cache_score_multiplier+POWER(1 + ln(2) / cache_half_life(), nextval('cache.cache_touches') - 2) END WHERE query_id=cached_query_id diff --git a/flowmachine/flowmachine/core/preflight.py b/flowmachine/flowmachine/core/preflight.py index 18cfad3511..125a0c86c0 100644 --- a/flowmachine/flowmachine/core/preflight.py +++ b/flowmachine/flowmachine/core/preflight.py @@ -88,7 +88,7 @@ def preflight(self): try: getattr(dependency, hook)() except Exception as e: - errors.setdefault(str(dependency), []).append(e) + errors.setdefault(str(dependency), list()).append(e) if len(errors) > 0: logger.debug( "Pre-flight failed.", diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 717071893b..0bba2e638c 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -124,11 +124,17 @@ def ff_state_machine(self): get_redis(), self.query_id, get_db().conn_id ) if not q_state_machine.is_completed: - q_state_machine.enqueue() - q_state_machine.execute() - with get_db().engine.begin() as trans: - write_cache_metadata(trans, self, compute_time=0) - q_state_machine.finish() + _, succeeded = q_state_machine.enqueue() + if succeeded: + _, succeeded = q_state_machine.execute() + if succeeded: + with get_db().engine.begin() as trans: + write_cache_metadata(trans, self, compute_time=0) + state, succeeded = q_state_machine.finish() + if not succeeded: + raise RuntimeError( + f"Couldn't fast forward state machine for table {self}. 
State is: {state}" + ) def __format__(self, fmt): return f"" diff --git a/flowmachine/tests/test_cache_utils.py b/flowmachine/tests/test_cache_utils.py index 24773894ce..fcd7ba931e 100644 --- a/flowmachine/tests/test_cache_utils.py +++ b/flowmachine/tests/test_cache_utils.py @@ -77,8 +77,8 @@ def test_touch_cache_record_for_query(flowmachine_connect): """ Touching a cache record for a query should update access count, last accessed, & counter. """ - initial_touches = get_db().fetch("SELECT nextval('cache.cache_touches');")[0][0] table = daily_location("2016-01-01").store().result() + initial_touches = get_db().fetch("SELECT nextval('cache.cache_touches');")[0][0] assert ( 1 From f809daa5368d45772f6c410b4a3b5e44dc185b9e Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 16:21:14 +0100 Subject: [PATCH 27/89] Match cache half life --- flowmachine/flowmachine/core/cache.py | 39 +++++++++++++++++++++++++++ flowmachine/tests/test_cache_utils.py | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index ef12032eff..55b9ae7bc2 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -278,6 +278,7 @@ def write_cache_metadata( logger.debug(f"Can't pickle ({e}), attempting to cache anyway.") pass +<<<<<<< HEAD cache_record_insert = """ INSERT INTO cache.cached (query_id, version, query, created, access_count, last_accessed, compute_time, @@ -309,6 +310,39 @@ def write_cache_metadata( logger.debug(f"{query.fully_qualified_table_name} added to cache.") else: logger.debug(f"Touched cache for {query.fully_qualified_table_name}.") +======= + with con.begin(): + cache_record_insert = """ + INSERT INTO cache.cached + (query_id, version, query, created, access_count, last_accessed, compute_time, + cache_score_multiplier, class, schema, tablename, obj) + VALUES (%s, %s, %s, NOW(), 0, NOW(), %s, 0, %s, %s, %s, %s) + ON CONFLICT (query_id) DO UPDATE SET last_accessed = NOW();""" + con.execute( + cache_record_insert, + ( + query.query_id, + __version__, + query._make_query(), + compute_time, + query.__class__.__name__, + *query.fully_qualified_table_name.split("."), + psycopg2.Binary(self_storage), + ), + ) + logger.debug("Touching cache.", query_id=query.query_id, query=str(query)) + con.execute("SELECT touch_cache(%s);", query.query_id) + + if not in_cache: + for dep in query._get_stored_dependencies(exclude_self=True): + con.execute( + "INSERT INTO cache.dependencies values (%s, %s) ON CONFLICT DO NOTHING", + (query.query_id, dep.query_id), + ) + logger.debug(f"{query.fully_qualified_table_name} added to cache.") + else: + logger.debug(f"Touched cache for {query.fully_qualified_table_name}.") +>>>>>>> 5b45f1582 (Match cache half life) except NotImplementedError: logger.debug("Table has no standard name.") @@ -329,12 +363,17 @@ def touch_cache(connection: "Connection", query_id: str) -> float: The new cache score """ try: +<<<<<<< HEAD with connection.engine.begin() as trans: return float( trans.exec_driver_sql(f"SELECT touch_cache('{query_id}')").fetchall()[ 0 ][0] ) +======= + logger.debug("Touching cache.", query_id=query_id) + return float(connection.fetch(f"SELECT touch_cache('{query_id}')")[0][0]) +>>>>>>> 5b45f1582 (Match cache half life) except (IndexError, psycopg2.InternalError): raise ValueError(f"Query id '{query_id}' is not in cache on this connection.") diff --git a/flowmachine/tests/test_cache_utils.py b/flowmachine/tests/test_cache_utils.py 
index fcd7ba931e..5f4c771407 100644 --- a/flowmachine/tests/test_cache_utils.py +++ b/flowmachine/tests/test_cache_utils.py @@ -56,7 +56,7 @@ def test_scoring(flowmachine_connect): dl_time = get_compute_time(get_db(), dl.query_id) dl_size = get_size_of_table(get_db(), dl.table_name, "cache") initial_score = get_score(get_db(), dl.query_id) - cachey_scorer = Scorer(halflife=1000.0) + cachey_scorer = Scorer(halflife=get_cache_half_life(get_db())) cache_score = cachey_scorer.touch("dl", dl_time / dl_size) assert cache_score == pytest.approx(initial_score) From 43faa7198fb678a3040ee8e70349c7495ebec985 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 16:25:02 +0100 Subject: [PATCH 28/89] Preflight subs table --- flowmachine/tests/test_subscriber_subsetting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flowmachine/tests/test_subscriber_subsetting.py b/flowmachine/tests/test_subscriber_subsetting.py index 543c764f6e..89a44bcd87 100644 --- a/flowmachine/tests/test_subscriber_subsetting.py +++ b/flowmachine/tests/test_subscriber_subsetting.py @@ -74,6 +74,7 @@ def subscriber_list_table(subscriber_list, flowmachine_connect): ) trans.exec_driver_sql(sql) subs_table = Table("subscriber_list", columns=["subscriber"]) + subs_table.preflight() try: yield subs_table finally: From bc7342eac1a9b5a694708944e7a1bb9861425113 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 12 Oct 2020 16:30:39 +0100 Subject: [PATCH 29/89] Fix dependency graph order in test --- flowmachine/tests/test_dependency_graph.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flowmachine/tests/test_dependency_graph.py b/flowmachine/tests/test_dependency_graph.py index 028064edda..a1641b8f48 100644 --- a/flowmachine/tests/test_dependency_graph.py +++ b/flowmachine/tests/test_dependency_graph.py @@ -52,18 +52,18 @@ def test_print_dependency_tree(): - - - + - + - - - - + - - - - - - - - - - - - - """ ) From 8c7bf495447853358824828f94031afccc323514 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 13 Oct 2020 09:39:24 +0100 Subject: [PATCH 30/89] Update fm tests --- flowmachine/tests/server/test_action_handlers.py | 2 +- flowmachine/tests/test_daily_location.py | 2 +- flowmachine/tests/test_model.py | 0 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 flowmachine/tests/test_model.py diff --git a/flowmachine/tests/server/test_action_handlers.py b/flowmachine/tests/server/test_action_handlers.py index f5b9582d21..da53cb6282 100644 --- a/flowmachine/tests/server/test_action_handlers.py +++ b/flowmachine/tests/server/test_action_handlers.py @@ -187,7 +187,7 @@ async def test_run_query_error_handled(dummy_redis, server_config): ), ) assert msg.status == ZMQReplyStatus.ERROR - assert msg.msg.rstrip() == "Preflight failed for 992b9767d40a3fe477f3cf642d94f36c." + assert msg.msg.rstrip() == "Preflight failed for d2e36b1c951a96b3a03c31c8531e8bbe." 
assert len(msg.payload["errors"]) == 3 diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index f2343f8825..a4b4241f44 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -82,5 +82,5 @@ def test_daily_locs_errors(): daily_location("2016-01-31").preflight() with pytest.raises(MissingDateError): raise exc.value.errors[ - "" + "" ][0] diff --git a/flowmachine/tests/test_model.py b/flowmachine/tests/test_model.py new file mode 100644 index 0000000000..e69de29bb2 From d97c76fab90f603c7be481d47bb1ebe78eccec47 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 13 Oct 2020 10:20:35 +0100 Subject: [PATCH 31/89] Fix not checking table arg type and geom table construction, --- .../flowmachine/core/server/action_handlers.py | 14 ++++++++++++-- .../core/server/query_schemas/base_schema.py | 4 +++- .../features/subscriber/contact_balance.py | 4 +++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/flowmachine/flowmachine/core/server/action_handlers.py b/flowmachine/flowmachine/core/server/action_handlers.py index 2ba79d1ecb..65dce6c082 100644 --- a/flowmachine/flowmachine/core/server/action_handlers.py +++ b/flowmachine/flowmachine/core/server/action_handlers.py @@ -14,6 +14,7 @@ # action handler and also gracefully handles any potential errors. # import asyncio +import structlog from contextvars import copy_context from functools import partial import json @@ -42,6 +43,9 @@ from ..dependency_graph import query_progress from ..errors.flowmachine_errors import PreFlightFailedException +import traceback + +logger = structlog.get_logger("flowmachine.debug", submodule=__name__) async def action_handler__ping(config: "FlowmachineServerConfig") -> ZMQReply: @@ -106,6 +110,11 @@ def _load_query_object(params: dict) -> "BaseExposedQuery": params, orig_error_msg=orig_error_msg, ) + raise QueryLoadError( + error_msg, + params, + orig_error_msg=orig_error_msg, + ) except ValidationError as exc: # The dictionary of marshmallow errors can contain integers as keys, # which will raise an error when converting to JSON (where the keys @@ -223,11 +232,12 @@ async def action_handler__run_query( "errors": exc.errors, }, ) - except Exception as e: + except Exception as exc: + logger.debug(str(exc), exception=exc, traceback=traceback.format_exc()) return ZMQReply( status="error", msg="Unable to create query object.", - payload={"exception": str(e)}, + payload={"exception": str(exc)}, ) # Register the query as "known" (so that we can later look up the query kind diff --git a/flowmachine/flowmachine/core/server/query_schemas/base_schema.py b/flowmachine/flowmachine/core/server/query_schemas/base_schema.py index 71a745aa9c..bb21fd60cb 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/base_schema.py +++ b/flowmachine/flowmachine/core/server/query_schemas/base_schema.py @@ -22,7 +22,9 @@ def remove_query_kind_if_present_and_load(self, params, **kwargs): elif "lon-lat" in aggregation_unit_string: spatial_unit_args = { "spatial_unit_type": "lon-lat", - "geom_table": Table(geom_table), + "geom_table": Table( + geom_table, columns=[geom_table_join_on, "geom_point"] + ), "geom_table_join_on": geom_table_join_on, } else: diff --git a/flowmachine/flowmachine/features/subscriber/contact_balance.py b/flowmachine/flowmachine/features/subscriber/contact_balance.py index 1a6ca5f44e..7ea3960331 100644 --- a/flowmachine/flowmachine/features/subscriber/contact_balance.py +++ 
b/flowmachine/flowmachine/features/subscriber/contact_balance.py @@ -77,7 +77,9 @@ def __init__( subscriber_subset=None, ): self.tables = ( - ["calls", "sms"] if tables.lower() == "all" or tables is None else tables + ["calls", "sms"] + if (isinstance(tables, str) and tables.lower() == "all") or tables is None + else tables ) self.start = standardise_date(start) self.stop = standardise_date(stop) From a4e9693524064c28467b0ee83bc9a5e07564488d Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 13 Oct 2020 11:50:14 +0100 Subject: [PATCH 32/89] Handle None table case --- .../flowmachine/core/server/query_schemas/base_schema.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flowmachine/flowmachine/core/server/query_schemas/base_schema.py b/flowmachine/flowmachine/core/server/query_schemas/base_schema.py index bb21fd60cb..0ee6df244e 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/base_schema.py +++ b/flowmachine/flowmachine/core/server/query_schemas/base_schema.py @@ -22,9 +22,9 @@ def remove_query_kind_if_present_and_load(self, params, **kwargs): elif "lon-lat" in aggregation_unit_string: spatial_unit_args = { "spatial_unit_type": "lon-lat", - "geom_table": Table( - geom_table, columns=[geom_table_join_on, "geom_point"] - ), + "geom_table": None + if geom_table is None + else Table(geom_table, columns=[geom_table_join_on, "geom_point"]), "geom_table_join_on": geom_table_join_on, } else: From 7f464ff5a98e8bf21df964dec159abf4153f4ef8 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 16 Oct 2020 15:50:12 +0100 Subject: [PATCH 33/89] Store zero-score cache classes in db --- flowdb/bin/build/0020_schema_cache.sql | 5 ++++- flowdb/bin/build/0030_utilities.sql | 2 +- flowmachine/flowmachine/core/cache.py | 14 +------------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/flowdb/bin/build/0020_schema_cache.sql b/flowdb/bin/build/0020_schema_cache.sql index 0b3bf37bc4..980ba91461 100644 --- a/flowdb/bin/build/0020_schema_cache.sql +++ b/flowdb/bin/build/0020_schema_cache.sql @@ -48,4 +48,7 @@ CREATE TABLE IF NOT EXISTS cache.dependencies CREATE TABLE cache.cache_config (key text, value text); INSERT INTO cache.cache_config (key, value) VALUES ('half_life', NULL); INSERT INTO cache.cache_config (key, value) VALUES ('cache_size', NULL); -INSERT INTO cache.cache_config (key, value) VALUES ('cache_protected_period', NULL); \ No newline at end of file +INSERT INTO cache.cache_config (key, value) VALUES ('cache_protected_period', NULL); + +CREATE TABLE cache.zero_cache (object_class text); +INSERT INTO cache.zero_cache (object_class) VALUES ('Table'), ('GeoTable'), ('CallsTable'), ('SmsTable'), ('MdsTable'), ('TopupsTable'), ('ForwardsTable'), ('TacsTable'), ('CellsTable'), ('SitesTable'); \ No newline at end of file diff --git a/flowdb/bin/build/0030_utilities.sql b/flowdb/bin/build/0030_utilities.sql index 56a4f49fe9..f0926549f3 100644 --- a/flowdb/bin/build/0030_utilities.sql +++ b/flowdb/bin/build/0030_utilities.sql @@ -243,7 +243,7 @@ $$ DECLARE score float; BEGIN UPDATE cache.cached SET last_accessed = NOW(), access_count = access_count + 1, - cache_score_multiplier = CASE WHEN class=ANY(ARRAY['Table', 'GeoTable', 'CallsTable', 'SmsTable', 'MdsTable', 'TopupsTable', 'ForwardsTable', 'TacsTable', 'CellsTable', 'SitesTable']) THEN 0 ELSE + cache_score_multiplier = CASE WHEN class=ANY(array_agg((SELECT object_class FROM cache.cache.zero_cache))) THEN 0 ELSE cache_score_multiplier+POWER(1 + ln(2) / cache_half_life(), 
nextval('cache.cache_touches') - 2) END WHERE query_id=cached_query_id diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index 55b9ae7bc2..7ba92ee413 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -515,18 +515,6 @@ def get_query_object_by_id(connection: "Connection", query_id: str) -> "Query": raise ValueError(f"Query id '{query_id}' is not in cache on this connection.") -def _get_protected_classes(): - from flowmachine.core.events_table import events_table_map - from flowmachine.core.infrastructure_table import infrastructure_table_map - - return [ - "Table", - "GeoTable", - *[cls.__name__ for cls in events_table_map.values()], - *[cls.__name__ for cls in infrastructure_table_map.values()], - ] - - def get_cached_query_objects_ordered_by_score( connection: "Connection", protected_period: Optional[int] = None, @@ -738,7 +726,7 @@ def get_size_of_cache(connection: "Connection") -> int: """ sql = f"""SELECT sum(table_size(tablename, schema)) as total_bytes FROM cache.cached - WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()}))""" + WHERE NOT (cached.class=ANY(array_agg((SELECT object_class FROM cache.cache.zero_cache))))""" cache_bytes = connection.fetch(sql)[0][0] return 0 if cache_bytes is None else int(cache_bytes) From 07dc063ee0cfd927c4265660cc1096642a6324af Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 16 Oct 2020 16:43:00 +0100 Subject: [PATCH 34/89] Fix table name typo --- flowdb/bin/build/0030_utilities.sql | 2 +- flowmachine/flowmachine/core/cache.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flowdb/bin/build/0030_utilities.sql b/flowdb/bin/build/0030_utilities.sql index f0926549f3..b31d1b6a0c 100644 --- a/flowdb/bin/build/0030_utilities.sql +++ b/flowdb/bin/build/0030_utilities.sql @@ -243,7 +243,7 @@ $$ DECLARE score float; BEGIN UPDATE cache.cached SET last_accessed = NOW(), access_count = access_count + 1, - cache_score_multiplier = CASE WHEN class=ANY(array_agg((SELECT object_class FROM cache.cache.zero_cache))) THEN 0 ELSE + cache_score_multiplier = CASE WHEN class=ANY(array_agg((SELECT object_class FROM cache.zero_cache))) THEN 0 ELSE cache_score_multiplier+POWER(1 + ln(2) / cache_half_life(), nextval('cache.cache_touches') - 2) END WHERE query_id=cached_query_id diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index 7ba92ee413..4d0a8c40b0 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -544,7 +544,7 @@ def get_cached_query_objects_ordered_by_score( ) qry = f"""SELECT query_id, table_size(tablename, schema) as table_size FROM cache.cached - WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()})) + WHERE NOT (cached.class=ANY(array_agg((SELECT object_class FROM cache.zero_cache)))) {protected_period_clause} ORDER BY cache_score(cache_score_multiplier, compute_time, table_size(tablename, schema)) ASC """ @@ -726,7 +726,7 @@ def get_size_of_cache(connection: "Connection") -> int: """ sql = f"""SELECT sum(table_size(tablename, schema)) as total_bytes FROM cache.cached - WHERE NOT (cached.class=ANY(array_agg((SELECT object_class FROM cache.cache.zero_cache))))""" + WHERE NOT (cached.class=ANY(array_agg((SELECT object_class FROM cache.zero_cache))))""" cache_bytes = connection.fetch(sql)[0][0] return 0 if cache_bytes is None else int(cache_bytes) From 68b2cc91eebb7a9a380d3e11e3426d079f4276be Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 
19 Oct 2020 16:41:52 +0100 Subject: [PATCH 35/89] Syntax for touch_cache --- flowdb/bin/build/0030_utilities.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flowdb/bin/build/0030_utilities.sql b/flowdb/bin/build/0030_utilities.sql index b31d1b6a0c..98217d518e 100644 --- a/flowdb/bin/build/0030_utilities.sql +++ b/flowdb/bin/build/0030_utilities.sql @@ -243,9 +243,10 @@ $$ DECLARE score float; BEGIN UPDATE cache.cached SET last_accessed = NOW(), access_count = access_count + 1, - cache_score_multiplier = CASE WHEN class=ANY(array_agg((SELECT object_class FROM cache.zero_cache))) THEN 0 ELSE + cache_score_multiplier = CASE WHEN class=ANY(no_score.classes) THEN 0 ELSE cache_score_multiplier+POWER(1 + ln(2) / cache_half_life(), nextval('cache.cache_touches') - 2) END + FROM (SELECT array_agg(object_class) as classes FROM cache.zero_cache) AS no_score WHERE query_id=cached_query_id RETURNING cache_score(cache_score_multiplier, compute_time, greatest(table_size(tablename, schema), 0.00001)) INTO score; IF NOT FOUND THEN RAISE EXCEPTION 'Cache record % not found', cached_query_id; From 9c1305767aa51da19361816cd4a24ff013929499 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 19 Oct 2020 17:00:23 +0100 Subject: [PATCH 36/89] Syntax for sort --- flowmachine/flowmachine/core/cache.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index 4d0a8c40b0..faaa5bc222 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -542,9 +542,11 @@ def get_cached_query_objects_ordered_by_score( if protected_period is not None else " AND NOW()-created > (cache_protected_period()*INTERVAL '1 seconds')" ) - qry = f"""SELECT query_id, table_size(tablename, schema) as table_size + qry = f""" + WITH no_score AS (SELECT array_agg(object_class) as classes FROM cache.zero_cache) + SELECT query_id, table_size(tablename, schema) as table_size FROM cache.cached - WHERE NOT (cached.class=ANY(array_agg((SELECT object_class FROM cache.zero_cache)))) + WHERE NOT (cached.class=ANY(classes)) {protected_period_clause} ORDER BY cache_score(cache_score_multiplier, compute_time, table_size(tablename, schema)) ASC """ From b72b207f303832f14bd03619627afa2ee6bf3d66 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 19 Oct 2020 17:20:51 +0100 Subject: [PATCH 37/89] Wrap in subquery --- flowmachine/flowmachine/core/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index faaa5bc222..b5382ef86e 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -546,7 +546,7 @@ def get_cached_query_objects_ordered_by_score( WITH no_score AS (SELECT array_agg(object_class) as classes FROM cache.zero_cache) SELECT query_id, table_size(tablename, schema) as table_size FROM cache.cached - WHERE NOT (cached.class=ANY(classes)) + WHERE NOT (cached.class=ANY((SELECT classes FROM no_score)) {protected_period_clause} ORDER BY cache_score(cache_score_multiplier, compute_time, table_size(tablename, schema)) ASC """ From adadf5bc5fb6f630890f85e20d30d5382163ebc2 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 20 Oct 2020 11:26:01 +0100 Subject: [PATCH 38/89] Missing bracket --- flowmachine/flowmachine/core/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/cache.py 
b/flowmachine/flowmachine/core/cache.py index b5382ef86e..fbd824257d 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -546,7 +546,7 @@ def get_cached_query_objects_ordered_by_score( WITH no_score AS (SELECT array_agg(object_class) as classes FROM cache.zero_cache) SELECT query_id, table_size(tablename, schema) as table_size FROM cache.cached - WHERE NOT (cached.class=ANY((SELECT classes FROM no_score)) + WHERE NOT (cached.class=ANY((SELECT classes FROM no_score))) {protected_period_clause} ORDER BY cache_score(cache_score_multiplier, compute_time, table_size(tablename, schema)) ASC """ From aef870ed6e4e17e7bd0b83891219132307dd6c39 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 20 Oct 2020 11:52:10 +0100 Subject: [PATCH 39/89] Add missing cast --- flowmachine/flowmachine/core/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index fbd824257d..ca6cfd1370 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -546,7 +546,7 @@ def get_cached_query_objects_ordered_by_score( WITH no_score AS (SELECT array_agg(object_class) as classes FROM cache.zero_cache) SELECT query_id, table_size(tablename, schema) as table_size FROM cache.cached - WHERE NOT (cached.class=ANY((SELECT classes FROM no_score))) + WHERE NOT (cached.class=ANY((SELECT classes FROM no_score)::TEXT[])) {protected_period_clause} ORDER BY cache_score(cache_score_multiplier, compute_time, table_size(tablename, schema)) ASC """ From 1bce844037b0830c2657e668c90d6d65dbe3e46e Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 20 Oct 2020 12:34:15 +0100 Subject: [PATCH 40/89] Use subquery for get size of cache --- flowmachine/flowmachine/core/cache.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index ca6cfd1370..059fa0d30a 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -726,9 +726,11 @@ def get_size_of_cache(connection: "Connection") -> int: Number of bytes in total used by cache tables """ - sql = f"""SELECT sum(table_size(tablename, schema)) as total_bytes + sql = f""" + WITH no_score AS (SELECT array_agg(object_class) as classes FROM cache.zero_cache) + SELECT sum(table_size(tablename, schema)) as total_bytes FROM cache.cached - WHERE NOT (cached.class=ANY(array_agg((SELECT object_class FROM cache.zero_cache))))""" + WHERE NOT (cached.class=ANY((SELECT classes FROM no_score)::TEXT[])""" cache_bytes = connection.fetch(sql)[0][0] return 0 if cache_bytes is None else int(cache_bytes) From d6d5920a62837b27a6592751345ebd8f3988ffd6 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 20 Oct 2020 12:50:46 +0100 Subject: [PATCH 41/89] Missed a bracket --- flowmachine/flowmachine/core/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index 059fa0d30a..0085a893bc 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -730,7 +730,7 @@ def get_size_of_cache(connection: "Connection") -> int: WITH no_score AS (SELECT array_agg(object_class) as classes FROM cache.zero_cache) SELECT sum(table_size(tablename, schema)) as total_bytes FROM cache.cached - WHERE NOT (cached.class=ANY((SELECT classes FROM no_score)::TEXT[])""" + WHERE NOT 
(cached.class=ANY((SELECT classes FROM no_score)::TEXT[]))""" cache_bytes = connection.fetch(sql)[0][0] return 0 if cache_bytes is None else int(cache_bytes) From 86e6c2287f15ab9e6fbbe07737c18d11d56e31b0 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 20 Oct 2020 13:12:50 +0100 Subject: [PATCH 42/89] Remove zero cache from internal tables in test helper --- integration_tests/tests/flowmachine_server_tests/helpers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integration_tests/tests/flowmachine_server_tests/helpers.py b/integration_tests/tests/flowmachine_server_tests/helpers.py index 1bdd7574e6..e40ec2c41d 100644 --- a/integration_tests/tests/flowmachine_server_tests/helpers.py +++ b/integration_tests/tests/flowmachine_server_tests/helpers.py @@ -42,6 +42,7 @@ def get_cache_tables(fm_conn, exclude_internal_tables=True): cache_tables.remove("cached") cache_tables.remove("dependencies") cache_tables.remove("cache_config") + cache_tables.remove("zero_cache") return sorted(cache_tables) From 1efdd5e16068188ba8f5585f7087e26b7bc35a33 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 20 Oct 2020 13:42:25 +0100 Subject: [PATCH 43/89] Centralise list of internal tables in helper --- .../tests/flowmachine_server_tests/helpers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/integration_tests/tests/flowmachine_server_tests/helpers.py b/integration_tests/tests/flowmachine_server_tests/helpers.py index e40ec2c41d..3b15625866 100644 --- a/integration_tests/tests/flowmachine_server_tests/helpers.py +++ b/integration_tests/tests/flowmachine_server_tests/helpers.py @@ -6,6 +6,8 @@ from flowmachine.core.server.utils import send_zmq_message_and_receive_reply +internal_tables = sorted(["cache_config", "cached", "dependencies", "zero_cache"]) + def poll_until_done(port, query_id, max_tries=100): """ @@ -39,10 +41,8 @@ def get_cache_tables(fm_conn, exclude_internal_tables=True): insp = inspect(fm_conn.engine) cache_tables = insp.get_table_names(schema="cache") if exclude_internal_tables: - cache_tables.remove("cached") - cache_tables.remove("dependencies") - cache_tables.remove("cache_config") - cache_tables.remove("zero_cache") + for table in internal_tables: + cache_tables.remove(table) return sorted(cache_tables) @@ -58,7 +58,7 @@ def cache_schema_is_empty(fm_conn, check_internal_tables_are_empty=True): cache_tables = sorted(insp.get_table_names(schema="cache")) # Check that there are no cached tables except the flowdb-internal ones - if cache_tables != ["cache_config", "cached", "dependencies"]: + if cache_tables != internal_tables: return False if check_internal_tables_are_empty: From c20f4a9fe6814b32744cb9e0edb832195e2a667a Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 22 Oct 2020 16:31:59 +0100 Subject: [PATCH 44/89] Don't call the database from init of RandomSystemRows --- flowmachine/flowmachine/core/random.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/flowmachine/flowmachine/core/random.py b/flowmachine/flowmachine/core/random.py index 7996ed3734..22649f495f 100644 --- a/flowmachine/flowmachine/core/random.py +++ b/flowmachine/flowmachine/core/random.py @@ -9,6 +9,7 @@ from typing import List, Optional, Dict, Any, Union, Type, Tuple from abc import ABCMeta, abstractmethod +from . 
import preflight from .query import Query from .table import Table @@ -145,18 +146,20 @@ def __init__( fraction: Optional[float] = None, estimate_count: bool = False, ): + super().__init__( + query=query, size=size, fraction=fraction, estimate_count=estimate_count + ) + + @preflight + def check_not_inherited(self): # Raise a value error if the query is a table, and has children, as the # method relies on it not having children. - if isinstance(query, Table) and query.has_children(): + if isinstance(self.query, Table) and self.query.has_children(): raise ValueError( "It is not possible to use the 'system_rows' method in tables with inheritance " + "as it selects a random sample for each child table and not for the set as a whole." ) - super().__init__( - query=query, size=size, fraction=fraction, estimate_count=estimate_count - ) - def _make_query(self) -> str: # TABLESAMPLE only works on tables, so silently store this query self.query.store().result() From 3f8ed902bca88cede5f0c3259d573f5028adef5b Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 22 Oct 2020 16:48:25 +0100 Subject: [PATCH 45/89] Correct import --- flowmachine/flowmachine/core/random.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flowmachine/flowmachine/core/random.py b/flowmachine/flowmachine/core/random.py index 22649f495f..c229d5c3c2 100644 --- a/flowmachine/flowmachine/core/random.py +++ b/flowmachine/flowmachine/core/random.py @@ -6,10 +6,10 @@ Classes to select random samples from queries or tables. """ import random -from typing import List, Optional, Dict, Any, Union, Type, Tuple +from typing import List, Optional, Dict, Any, Type, Tuple from abc import ABCMeta, abstractmethod -from . import preflight +from .preflight import pre_flight from .query import Query from .table import Table @@ -150,7 +150,7 @@ def __init__( query=query, size=size, fraction=fraction, estimate_count=estimate_count ) - @preflight + @pre_flight def check_not_inherited(self): # Raise a value error if the query is a table, and has children, as the # method relies on it not having children. From 96ca0eb698292f6d73e7dd263047f51d8f408d2c Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 22 Oct 2020 16:58:03 +0100 Subject: [PATCH 46/89] Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44a8e7132b..385451c3b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -176,6 +176,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). #### Added - `inflows` and `outflows` exposed via API endpoint + added to flowclient [#2029](https://github.com/Flowminder/FlowKit/issues/2029), [#4866](https://github.com/Flowminder/FlowKit/issues/4866) +- Added a new `@pre_flight` decorator which `Query` subclasses may use to indicate a method should be run to confirm the query is runnable +- Added a new `preflight()` method to query, which calls all applicable pre-flight check methods for the query ### Changed - __Action Needed__ Airflow updated to version 2.3.3; **backup flowetl_db before applying update** [#4940](https://github.com/Flowminder/FlowKit/pull/4940) @@ -186,6 +188,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - FlowDB now always creates a role named `flowmachine.` - Flowmachine will set the state of a query being stored to cancelled if interrupted while the store is running. 
- Flowmachine now supports sqlalchemy >=1.4 [#5140](https://github.com/Flowminder/FlowKit/issues/5140) +- `get_cached_query_objects_ordered_by_score` is now a generator. [#3116](https://github.com/Flowminder/FlowKit/issues/3116) +- Queries should no longer require communication with the database during `__init__`, any checks that require database access must now be implemented as a method of the class and use the `@pre_flight` decorator ### Fixed - Flowmachine now makes the built in `flowmachine` role owner of cache tables as a post-action when a query is `store`d. [#4714](https://github.com/Flowminder/FlowKit/issues/4714) From d9d08e752f2ef439bc6cac867eaf544865a18e5a Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 22 Oct 2020 17:00:38 +0100 Subject: [PATCH 47/89] Move geotable column checks back into init (no db required) --- flowmachine/flowmachine/core/geotable.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/flowmachine/flowmachine/core/geotable.py b/flowmachine/flowmachine/core/geotable.py index e9714ec2b1..cc024f05ed 100644 --- a/flowmachine/flowmachine/core/geotable.py +++ b/flowmachine/flowmachine/core/geotable.py @@ -53,9 +53,6 @@ def __init__( self.geom_column = geom_column self.gid_column = gid_column super().__init__(name=name, schema=schema, columns=columns) - - @pre_flight - def check_geom_column_present(self): if self.geom_column not in self.column_names: raise ValueError( "geom_column: {} is not a column in this table.".format( From c3507a4d8090d81f0c9957ea852ed89598b86bd6 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 22 Oct 2020 17:18:56 +0100 Subject: [PATCH 48/89] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 385451c3b6..05ab4bd1a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -190,6 +190,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Flowmachine now supports sqlalchemy >=1.4 [#5140](https://github.com/Flowminder/FlowKit/issues/5140) - `get_cached_query_objects_ordered_by_score` is now a generator. [#3116](https://github.com/Flowminder/FlowKit/issues/3116) - Queries should no longer require communication with the database during `__init__`, any checks that require database access must now be implemented as a method of the class and use the `@pre_flight` decorator +- When specifying tables in Flowmachine, the `events.` prefix is no longer required. ### Fixed - Flowmachine now makes the built in `flowmachine` role owner of cache tables as a post-action when a query is `store`d. [#4714](https://github.com/Flowminder/FlowKit/issues/4714) @@ -200,6 +201,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Removed - `use_file_flux_sensor` removed entirely. [#2812](https://github.com/Flowminder/FlowKit/issues/2812) - `Model`, `ModelResult` and `Louvain` have been removed. [#5168](https://github.com/Flowminder/FlowKit/issues/5168) +- `Table` no longer automatically infers columns from the database, they must be specified. 
## [1.16.0] From 4baea78c5b48b1c520ef58da79480321f0fec97c Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 13 Nov 2020 15:04:03 +0000 Subject: [PATCH 49/89] Handle empty table list --- .../features/utilities/events_tables_union.py | 99 ++++++++++--------- 1 file changed, 52 insertions(+), 47 deletions(-) diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index ac33af5495..67901fe685 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -63,12 +63,15 @@ def __init__( self.start = standardise_date(start) self.stop = standardise_date(stop) self.columns = columns - self.tables = self._parse_tables(tables) + self.tables = _parse_tables(tables) if "*" in columns and len(self.tables) != 1: raise ValueError( "Must give named columns when combining multiple event type tables." ) - self.date_subsets = self._make_table_list( + self.date_subsets = _make_table_list( + tables=self.tables, + start=self.start, + stop=self.stop, hours=hours, subscriber_subset=subscriber_subset, subscriber_identifier=subscriber_identifier, @@ -82,51 +85,6 @@ def column_names(self) -> List[str]: 0 ].column_names # Use in preference to self.columns which might be ["*"] - def _parse_tables(self, tables): - if tables is None: - return ( - "calls", - "sms", - ) # This should default to all the tables really, but that would break all the tests - elif isinstance(tables, str): - return [tables] - elif isinstance(tables, str): - raise ValueError("Empty table name.") - elif not isinstance(tables, list) or not all( - [isinstance(tbl, str) for tbl in tables] - ): - raise ValueError("Tables must be a string or list of strings.") - elif len(tables) == 0: - raise ValueError("Empty tables list.") - else: - return sorted(tables) - - def _make_table_list(self, *, hours, subscriber_subset, subscriber_identifier): - """ - Makes a list of EventTableSubset queries. - """ - - date_subsets = [] - for table in self.tables: - try: - sql = EventTableSubset( - start=self.start, - stop=self.stop, - table=table, - columns=self.columns, - hours=hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=subscriber_identifier, - ) - date_subsets.append(sql) - except MissingDateError: - warnings.warn( - f"No data in {table} for {self.start}–{self.stop}", stacklevel=2 - ) - if not date_subsets: - raise MissingDateError(self.start, self.stop) - return date_subsets - def _make_query(self): # Get the list of tables, select the relevant columns and union # them all @@ -138,3 +96,50 @@ def _make_query(self): def fully_qualified_table_name(self): # EventTableSubset are a simple select from events, and should not be cached raise NotImplementedError + + +def _parse_tables(tables): + if tables is None: + return ( + "calls", + "sms", + ) # This should default to all the tables really, but that would break all the tests + elif isinstance(tables, str): + return [tables] + elif isinstance(tables, str): + raise ValueError("Empty table name.") + elif not isinstance(tables, list) or not all( + [isinstance(tbl, str) for tbl in tables] + ): + raise ValueError("Tables must be a string or list of strings.") + elif len(tables) == 0: + raise ValueError("Empty tables list.") + else: + return sorted(tables) + + +def _make_table_list( + *, tables, start, stop, columns, hours, subscriber_subset, subscriber_identifier +): + """ + Makes a list of EventTableSubset queries. 
+ """ + + date_subsets = [] + for table in tables: + try: + sql = EventTableSubset( + start=start, + stop=stop, + table=table, + columns=columns, + hours=hours, + subscriber_subset=subscriber_subset, + subscriber_identifier=subscriber_identifier, + ) + date_subsets.append(sql) + except MissingDateError: + warnings.warn(f"No data in {table} for {start}–{stop}", stacklevel=2) + if not date_subsets: + raise MissingDateError(start, stop) + return date_subsets From 0c6f1252cbe17e055edfce25b2fd5ae72bd5b5a3 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 13 Nov 2020 15:25:38 +0000 Subject: [PATCH 50/89] Add missing arg --- .../flowmachine/features/utilities/events_tables_union.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index 67901fe685..0ded39a80f 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -5,10 +5,9 @@ import structlog import warnings -from typing import List, Union, Optional +from typing import List, Union from ...core import Query -from ...core.context import get_db from ...core.errors import MissingDateError from .event_table_subset import EventTableSubset from flowmachine.utils import standardise_date @@ -72,6 +71,7 @@ def __init__( tables=self.tables, start=self.start, stop=self.stop, + columns=columns, hours=hours, subscriber_subset=subscriber_subset, subscriber_identifier=subscriber_identifier, From 1b78892a3f02e655549d7aa4f879705cf0ded9c2 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 18 Dec 2020 16:08:33 +0000 Subject: [PATCH 51/89] Fix columns check --- flowmachine/flowmachine/core/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 0bba2e638c..2e281c5346 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -109,7 +109,7 @@ def check_columns(self): logger.debug( "Checking provided columns against db columns.", - provided=columns, + provided=self.columns, db_columns=db_columns, ) if not set(self.columns).issubset(db_columns): From c04821068fa3d6745750ed79c206acdb9608e8c9 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 18 Dec 2020 16:32:25 +0000 Subject: [PATCH 52/89] Clean up table signature a bit --- flowmachine/flowmachine/core/geotable.py | 18 +++++++++++------- flowmachine/flowmachine/core/table.py | 23 +++++++++++------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/flowmachine/flowmachine/core/geotable.py b/flowmachine/flowmachine/core/geotable.py index cc024f05ed..d0009f5746 100644 --- a/flowmachine/flowmachine/core/geotable.py +++ b/flowmachine/flowmachine/core/geotable.py @@ -6,10 +6,10 @@ """ Simple utility class that represents tables with geometry. """ +from typing import Optional, List from . 
import Table from .mixins import GeoDataMixin -from .preflight import pre_flight class GeoTable(GeoDataMixin, Table): @@ -48,21 +48,25 @@ class GeoTable(GeoDataMixin, Table): """ def __init__( - self, name=None, schema=None, columns=None, geom_column="geom", gid_column=None + self, + name: str, + *, + schema: Optional[str] = None, + columns: List[str], + geom_column: str = "geom", + gid_column: Optional[str] = None, ): self.geom_column = geom_column self.gid_column = gid_column - super().__init__(name=name, schema=schema, columns=columns) if self.geom_column not in self.column_names: raise ValueError( - "geom_column: {} is not a column in this table.".format( - self.geom_column - ) + f"geom_column: {self.geom_column} is not a column in this table." ) if self.gid_column is not None and self.gid_column not in self.column_names: raise ValueError( - "gid_column: {} is not a column in this table.".format(self.gid_column) + f"gid_column: {self.gid_column} is not a column in this table." ) + super().__init__(name=name, schema=schema, columns=columns) def _geo_augmented_query(self): if self.gid_column is None: diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 2e281c5346..fa488984f1 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -39,7 +39,7 @@ class Table(Query): Examples -------- - >>> t = Table(name="calls", schema="events") + >>> t = Table(name="calls", schema="events", olumns=["id", "outgoing", "datetime", "duration"]) >>> t.head() id outgoing datetime duration \ 0 5wNJA-PdRJ4-jxEdG-yOXpZ True 2016-01-01 22:38:06+00:00 3393.0 @@ -63,7 +63,8 @@ class Table(Query): def __init__( self, - name: Optional[str] = None, + name: str, + *, schema: Optional[str] = None, columns: Optional[Iterable[str]] = None, ): @@ -78,9 +79,7 @@ def __init__( self.name = name self.schema = schema - self.fqn = "{}.{}".format(schema, name) if schema else name - if "." not in self.fqn: - raise ValueError("{} is not a valid table.".format(self.fqn)) + self.fqn = f"{schema}.{name}" if schema else name # Record provided columns to ensure that query_id differs with different columns if isinstance(columns, str): # Wrap strings in a list @@ -93,7 +92,7 @@ def __init__( @pre_flight def check_exists(self): if not self.is_stored: - raise ValueError("{} is not a known table.".format(self.fqn)) + raise ValueError(f"{self.fqn} is not a known table.") @pre_flight def check_columns(self): @@ -124,17 +123,17 @@ def ff_state_machine(self): get_redis(), self.query_id, get_db().conn_id ) if not q_state_machine.is_completed: - _, succeeded = q_state_machine.enqueue() + state, succeeded = q_state_machine.enqueue() if succeeded: - _, succeeded = q_state_machine.execute() + state, succeeded = q_state_machine.execute() if succeeded: with get_db().engine.begin() as trans: write_cache_metadata(trans, self, compute_time=0) state, succeeded = q_state_machine.finish() - if not succeeded: - raise RuntimeError( - f"Couldn't fast forward state machine for table {self}. State is: {state}" - ) + if not succeeded: + raise RuntimeError( + f"Couldn't fast forward state machine for table {self}. 
State is: {state}" + ) def __format__(self, fmt): return f"" From 8c50b3b4d917cf15b7b8b6ac56eca1a07d081673 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 21 Oct 2021 12:44:34 +0100 Subject: [PATCH 53/89] Fixups after rebase --- .../features/utilities/events_tables_union.py | 6 +++--- ...onstruction.test_construct_query.approved.txt | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index 0ded39a80f..b8ccd39e22 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -5,10 +5,10 @@ import structlog import warnings -from typing import List, Union +from typing import List, Union, Optional, Tuple -from ...core import Query -from ...core.errors import MissingDateError +from flowmachine.core import Query +from flowmachine.core.errors import MissingDateError from .event_table_subset import EventTableSubset from flowmachine.utils import standardise_date diff --git a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt index a210b4bf68..0d8f9bc359 100644 --- a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt +++ b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt @@ -1,5 +1,5 @@ { - "a91de70f2b73901616b6025c04600ae1": { + "554956857ccb51c5cb09ccc5ac7312be": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -20,7 +20,7 @@ } } }, - "d2e36b1c951a96b3a03c31c8531e8bbe": { + "fbbefde7546874a97dba31c6da6d23ca": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -32,7 +32,7 @@ "sampling": null } }, - "bf6c9cc30d8ac4728c03c7b903766973": { + "8ba6ff75c60d073c6c45b39a84e906f4": { "query_kind": "location_event_counts", "start_date": "2016-01-01", "end_date": "2016-01-02", @@ -42,7 +42,7 @@ "event_types": null, "subscriber_subset": null }, - "5643b75a52a7b0380a9341c318f1043f": { + "0da3a49e6f8038e21f4db27d0c0fed76": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "modal_location", @@ -68,7 +68,7 @@ "query_kind": "geography", "aggregation_unit": "admin3" }, - "1f4272f63a286decddf0977cd35a7438": { + "f1378a12316fbc2bb01c892235731e52": { "query_kind": "meaningful_locations_aggregate", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -161,7 +161,7 @@ "tower_cluster_call_threshold": 0, "subscriber_subset": null }, - "1b1889cff10cdd99fcb1468c92e99a20": { + "2fdd0e9ce17186c8f5f4bd3d5d6516a4": { "query_kind": "meaningful_locations_between_label_od_matrix", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -256,7 +256,7 @@ "event_types": null, "subscriber_subset": null }, - "2a90c6389bb46578b9eb863d132512cf": { + "9560e13665209c7774c2f4bdce67e0ee": { "query_kind": "meaningful_locations_between_dates_od_matrix", "aggregation_unit": "admin1", "start_date_a": "2016-01-01", @@ -934,4 +934,4 @@ }, "join_type": "full outer" } -} \ No newline at end of file +} From 61ed2fd226494b0111046d2262415aba69808b62 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 21 Oct 2021 12:56:58 +0100 Subject: [PATCH 54/89] Fix another rebase change --- .../flowmachine/features/utilities/event_table_subset.py | 5 ----- 1 file changed, 5 deletions(-) diff --git 
a/flowmachine/flowmachine/features/utilities/event_table_subset.py b/flowmachine/flowmachine/features/utilities/event_table_subset.py index 3ee4ec197c..da1bac1e29 100644 --- a/flowmachine/flowmachine/features/utilities/event_table_subset.py +++ b/flowmachine/flowmachine/features/utilities/event_table_subset.py @@ -133,11 +133,6 @@ def __init__( ) self.columns = sorted(self.columns) - self.sqlalchemy_table = get_sqlalchemy_table_definition( - self.table_ORIG.fully_qualified_table_name, - engine=get_db().engine, - ) - if self.start == self.stop: raise ValueError("Start and stop are the same.") From 5a6bb11062c752987b7fbef4ef22a3a6c03c2f4a Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 21 Oct 2021 14:00:50 +0100 Subject: [PATCH 55/89] More rebase issues --- flowmachine/tests/server/test_action_handlers.py | 2 +- flowmachine/tests/test_subscriber_location_subset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flowmachine/tests/server/test_action_handlers.py b/flowmachine/tests/server/test_action_handlers.py index da53cb6282..51d3a6b249 100644 --- a/flowmachine/tests/server/test_action_handlers.py +++ b/flowmachine/tests/server/test_action_handlers.py @@ -187,7 +187,7 @@ async def test_run_query_error_handled(dummy_redis, server_config): ), ) assert msg.status == ZMQReplyStatus.ERROR - assert msg.msg.rstrip() == "Preflight failed for d2e36b1c951a96b3a03c31c8531e8bbe." + assert msg.msg.rstrip() == "Preflight failed for 7b71413efc91213e798ca3bd53107186." assert len(msg.payload["errors"]) == 3 diff --git a/flowmachine/tests/test_subscriber_location_subset.py b/flowmachine/tests/test_subscriber_location_subset.py index 155955c90d..5b759b5bf7 100644 --- a/flowmachine/tests/test_subscriber_location_subset.py +++ b/flowmachine/tests/test_subscriber_location_subset.py @@ -44,7 +44,7 @@ def test_subscribers_who_make_atleast_3_calls_in_central_development_region(): """ start, stop = "2016-01-01", "2016-01-07" regions = Table( - "admin2", "geography", columns=["admin1name", "admin2pcod", "geom"] + "admin2", schema="geography", columns=["admin1name", "admin2pcod", "geom"] ).subset("admin1name", ["Central Development Region"]) sls = SubscriberLocationSubset( From d4a2e85dac2399ef6be793d62a7825f0f0572346 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 21 Oct 2021 14:25:03 +0100 Subject: [PATCH 56/89] Fixup geotable, fast forward even if already fast forwarded --- flowmachine/flowmachine/core/geotable.py | 4 ++-- flowmachine/flowmachine/core/table.py | 13 ++++++------- flowmachine/tests/test_geotable.py | 6 ++++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/flowmachine/flowmachine/core/geotable.py b/flowmachine/flowmachine/core/geotable.py index d0009f5746..9f4dbeeb39 100644 --- a/flowmachine/flowmachine/core/geotable.py +++ b/flowmachine/flowmachine/core/geotable.py @@ -58,11 +58,11 @@ def __init__( ): self.geom_column = geom_column self.gid_column = gid_column - if self.geom_column not in self.column_names: + if self.geom_column not in columns: raise ValueError( f"geom_column: {self.geom_column} is not a column in this table." ) - if self.gid_column is not None and self.gid_column not in self.column_names: + if self.gid_column is not None and self.gid_column not in columns: raise ValueError( f"gid_column: {self.gid_column} is not a column in this table." 
) diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index fa488984f1..4a0c45caf6 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -9,7 +9,7 @@ """ from typing import List, Iterable, Optional -from flowmachine.core.query_state import QueryStateMachine +from flowmachine.core.query_state import QueryStateMachine, QueryState from flowmachine.core.context import get_db, get_redis from flowmachine.core.preflight import pre_flight from flowmachine.core.query import Query @@ -124,13 +124,12 @@ def ff_state_machine(self): ) if not q_state_machine.is_completed: state, succeeded = q_state_machine.enqueue() + state, succeeded = q_state_machine.execute() if succeeded: - state, succeeded = q_state_machine.execute() - if succeeded: - with get_db().engine.begin() as trans: - write_cache_metadata(trans, self, compute_time=0) - state, succeeded = q_state_machine.finish() - if not succeeded: + with get_db().engine.begin() as trans: + write_cache_metadata(trans, self, compute_time=0) + state, succeeded = q_state_machine.finish() + if state != QueryState.COMPLETED: raise RuntimeError( f"Couldn't fast forward state machine for table {self}. State is: {state}" ) diff --git a/flowmachine/tests/test_geotable.py b/flowmachine/tests/test_geotable.py index c9fdc493c0..ec981d55a2 100644 --- a/flowmachine/tests/test_geotable.py +++ b/flowmachine/tests/test_geotable.py @@ -6,9 +6,11 @@ def test_geotable_bad_params(): """Test that geotable raises errors correctly.""" with pytest.raises(ValueError): - t = GeoTable("geography.admin3", geom_column="bad_column") + t = GeoTable("geography.admin3", geom_column="bad_column", columns=["geom"]) with pytest.raises(ValueError): - t = GeoTable("geography.admin3", gid_column="bad_column") + t = GeoTable( + "geography.admin3", gid_column="bad_column", columns=["geom", "gid"] + ) def test_geotable(): From 102dbf58e580f6bfc219e5a7207eaf0503cefec4 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 21 Oct 2021 14:44:36 +0100 Subject: [PATCH 57/89] Finish then write meta --- flowmachine/flowmachine/core/table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 4a0c45caf6..9be6dd78f7 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -125,6 +125,7 @@ def ff_state_machine(self): if not q_state_machine.is_completed: state, succeeded = q_state_machine.enqueue() state, succeeded = q_state_machine.execute() + state, succeeded = q_state_machine.finish() if succeeded: with get_db().engine.begin() as trans: write_cache_metadata(trans, self, compute_time=0) From faebea28f81bdb6f5e12f5f3b6c9169c53161e05 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 8 Nov 2021 12:37:30 +0000 Subject: [PATCH 58/89] Supply columns for Table --- flowmachine/tests/test_random.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/tests/test_random.py b/flowmachine/tests/test_random.py index f4c70339d7..30f69329a3 100644 --- a/flowmachine/tests/test_random.py +++ b/flowmachine/tests/test_random.py @@ -234,7 +234,7 @@ def test_system_rows_fail_with_inheritance(): Test whether the system row method fails if the subscriber queries for random rows on a parent table. 
""" with pytest.raises(ValueError): - df = Table(name="events.calls").random_sample( + df = Table(name="events.calls", columns=["msisdn"]).random_sample( size=8, sampling_method="system_rows" ) From f906ed0ff7113e619582021e2dab9d8dd1851ed4 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 8 Nov 2021 12:41:48 +0000 Subject: [PATCH 59/89] Fix missing date preflight error test --- flowmachine/tests/test_daily_location.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index a4b4241f44..775f5c2439 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -82,5 +82,9 @@ def test_daily_locs_errors(): daily_location("2016-01-31").preflight() with pytest.raises(MissingDateError): raise exc.value.errors[ - "" + "" + ][0] + with pytest.raises(MissingDateError): + raise exc.value.errors[ + "" ][0] From e2434bc642f6a625456d02522cbf9ecfef4c4e5e Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 8 Nov 2021 12:43:16 +0000 Subject: [PATCH 60/89] Update approval test --- ..._construction.test_construct_query.approved.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt index 0d8f9bc359..33e352a27a 100644 --- a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt +++ b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt @@ -1,5 +1,5 @@ { - "554956857ccb51c5cb09ccc5ac7312be": { + "1cc95af30e8b456123dab6ae4cc1f451": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -20,7 +20,7 @@ } } }, - "fbbefde7546874a97dba31c6da6d23ca": { + "7b71413efc91213e798ca3bd53107186": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -32,7 +32,7 @@ "sampling": null } }, - "8ba6ff75c60d073c6c45b39a84e906f4": { + "7bc462543d358dedb09a3e2c822b07d2": { "query_kind": "location_event_counts", "start_date": "2016-01-01", "end_date": "2016-01-02", @@ -42,7 +42,7 @@ "event_types": null, "subscriber_subset": null }, - "0da3a49e6f8038e21f4db27d0c0fed76": { + "1944671c4caa2e8adbfb2241df113b98": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "modal_location", @@ -68,7 +68,7 @@ "query_kind": "geography", "aggregation_unit": "admin3" }, - "f1378a12316fbc2bb01c892235731e52": { + "e6edd65d1884739353fc8a8b8bb2982c": { "query_kind": "meaningful_locations_aggregate", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -161,7 +161,7 @@ "tower_cluster_call_threshold": 0, "subscriber_subset": null }, - "2fdd0e9ce17186c8f5f4bd3d5d6516a4": { + "0f62ecc4078c84a11e6fd8fd473077a8": { "query_kind": "meaningful_locations_between_label_od_matrix", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -256,7 +256,7 @@ "event_types": null, "subscriber_subset": null }, - "9560e13665209c7774c2f4bdce67e0ee": { + "8513589b559f4b7ae5e4a6a7a283c44d": { "query_kind": "meaningful_locations_between_dates_od_matrix", "aggregation_unit": "admin1", "start_date_a": "2016-01-01", From 23b27284d258e991f0e3ea2369489d82d8fbce02 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 8 Nov 2021 13:06:59 +0000 Subject: [PATCH 61/89] Call preflight to raise error --- flowmachine/tests/test_random.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) 
diff --git a/flowmachine/tests/test_random.py b/flowmachine/tests/test_random.py index 30f69329a3..7dd87b605e 100644 --- a/flowmachine/tests/test_random.py +++ b/flowmachine/tests/test_random.py @@ -234,8 +234,10 @@ def test_system_rows_fail_with_inheritance(): Test whether the system row method fails if the subscriber queries for random rows on a parent table. """ with pytest.raises(ValueError): - df = Table(name="events.calls", columns=["msisdn"]).random_sample( - size=8, sampling_method="system_rows" + df = ( + Table(name="events.calls", columns=["msisdn"]) + .random_sample(size=8, sampling_method="system_rows") + .preflight() ) From 8f2d3c5e30026f156d6f5ea930bf7a616110660d Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 16 Jun 2022 12:38:38 +0100 Subject: [PATCH 62/89] Fix some usages of event. and update exceptions, Table usages --- .../flowmachine/core/server/query_schemas/custom_fields.py | 2 +- .../flowmachine/features/subscriber/active_subscribers.py | 4 ++-- .../flowmachine/features/utilities/event_table_subset.py | 2 +- flowmachine/flowmachine/features/utilities/sets.py | 4 +--- flowmachine/tests/test_active_subscribers.py | 6 +++--- flowmachine/tests/test_query_union.py | 2 +- flowmachine/tests/test_random.py | 3 ++- 7 files changed, 11 insertions(+), 12 deletions(-) diff --git a/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py b/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py index 5bda8ba328..f750f6b2ce 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py +++ b/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py @@ -56,7 +56,7 @@ class EventTypes(fields.List): """ A list of strings representing an event type, for example "calls", "sms", "mds", "topups". - When deserialised, will be deduped, and prefixed with "events." + When deserialised, will be deduped. """ def __init__( diff --git a/flowmachine/flowmachine/features/subscriber/active_subscribers.py b/flowmachine/flowmachine/features/subscriber/active_subscribers.py index 2eeeab1159..21506a2372 100644 --- a/flowmachine/flowmachine/features/subscriber/active_subscribers.py +++ b/flowmachine/flowmachine/features/subscriber/active_subscribers.py @@ -80,7 +80,7 @@ class ActiveSubscribers(ExposedDatetimeMixin, Query): total_major_periods=4, minor_period_threshold=1, major_period_threshold=3, - tables=["events.calls"], + tables=["calls"], ) Returns subscribers that were active in at least two ten minute intervals within half an hour, @@ -94,7 +94,7 @@ class ActiveSubscribers(ExposedDatetimeMixin, Query): minor_period_threshold=2, major_period_threshold=3, period_unit="minutes", - tables=["events.calls"], + tables=["calls"], ) diff --git a/flowmachine/flowmachine/features/utilities/event_table_subset.py b/flowmachine/flowmachine/features/utilities/event_table_subset.py index da1bac1e29..0dd47fc753 100644 --- a/flowmachine/flowmachine/features/utilities/event_table_subset.py +++ b/flowmachine/flowmachine/features/utilities/event_table_subset.py @@ -40,7 +40,7 @@ class EventTableSubset(Query): This will subset the query only with these hours, but across all specified days. Or set to 'all' to include all hours. 
- table : str, default 'events.calls' + table : str, default 'calls' schema qualified name of the table which the analysis is based upon subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' diff --git a/flowmachine/flowmachine/features/utilities/sets.py b/flowmachine/flowmachine/features/utilities/sets.py index 1ff69fc019..a4bacea0c5 100644 --- a/flowmachine/flowmachine/features/utilities/sets.py +++ b/flowmachine/flowmachine/features/utilities/sets.py @@ -46,9 +46,7 @@ class UniqueSubscribers(Query): table : str, default 'all' Table on which to perform the query. By default it will look at ALL tables, which are any tables with subscriber information - in them, specified via subscriber_tables in flowmachine.yml. Otherwise - you need to specify a full table (with a schema) such as - 'events.calls'. + in them, specified via subscriber_tables in flowmachine.yml. subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' Either msisdn, or imei, the column that identifies the subscriber. subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None diff --git a/flowmachine/tests/test_active_subscribers.py b/flowmachine/tests/test_active_subscribers.py index 2f610cc766..58cebaef15 100644 --- a/flowmachine/tests/test_active_subscribers.py +++ b/flowmachine/tests/test_active_subscribers.py @@ -19,7 +19,7 @@ def test_active_subscribers_one_day(get_dataframe): total_major_periods=1, minor_period_threshold=3, major_period_threshold=1, - tables=["events.calls"], + tables=["calls"], ) out = get_dataframe(active_subscribers).iloc[0:5] print(out) @@ -44,7 +44,7 @@ def test_active_subscribers_many_days(get_dataframe): total_major_periods=4, minor_period_threshold=1, major_period_threshold=3, - tables=["events.calls"], + tables=["calls"], ) out = get_dataframe(active_subscribers).iloc[0:5] print(out) @@ -72,7 +72,7 @@ def test_active_subscribers_custom_period(get_dataframe): minor_period_threshold=2, major_period_threshold=3, period_unit="minutes", - tables=["events.calls"], + tables=["calls"], ) assert len(active_subscribers.major_period_queries) == 4 assert active_subscribers.major_period_queries[2].start == "2016-01-01 21:00:00" diff --git a/flowmachine/tests/test_query_union.py b/flowmachine/tests/test_query_union.py index 0e483f74e1..04332e386f 100644 --- a/flowmachine/tests/test_query_union.py +++ b/flowmachine/tests/test_query_union.py @@ -44,7 +44,7 @@ def test_union_raises_with_mismatched_columns(): """ with pytest.raises(ValueError): Table(schema="events", name="calls", columns=["msisdn"]).union( - Table(schema="events", name="calls") + Table(schema="events", name="calls", columns=["id"]) ) diff --git a/flowmachine/tests/test_random.py b/flowmachine/tests/test_random.py index 7dd87b605e..56fbdd159b 100644 --- a/flowmachine/tests/test_random.py +++ b/flowmachine/tests/test_random.py @@ -11,6 +11,7 @@ import pytest import pickle +from flowmachine.core.errors.flowmachine_errors import PreFlightFailedException from flowmachine.core.mixins import GraphMixin from flowmachine.features import daily_location, Flows from flowmachine.features.utilities.sets import UniqueSubscribers @@ -233,7 +234,7 @@ def test_system_rows_fail_with_inheritance(): """ Test whether the system row method fails if the subscriber queries for random rows on a parent table. 
""" - with pytest.raises(ValueError): + with pytest.raises(PreFlightFailedException): df = ( Table(name="events.calls", columns=["msisdn"]) .random_sample(size=8, sampling_method="system_rows") From 2584008afb23d019aeb274911009b49d0be9694e Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 27 Jun 2022 13:19:26 +0100 Subject: [PATCH 63/89] lint --- flowmachine/flowmachine/core/cache.py | 69 ++++++++------------------- 1 file changed, 21 insertions(+), 48 deletions(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index 0085a893bc..1813132e52 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -199,7 +199,9 @@ def write_query_to_cache( con = connection.engine with con.begin() as trans: try: - plan_time = run_ops_list_and_return_execution_time(query_ddl_ops, trans) + plan_time = run_ops_list_and_return_execution_time( + query_ddl_ops, trans + ) logger.debug("Executed queries.") except Exception as exc: q_state_machine.raise_error() @@ -278,7 +280,6 @@ def write_cache_metadata( logger.debug(f"Can't pickle ({e}), attempting to cache anyway.") pass -<<<<<<< HEAD cache_record_insert = """ INSERT INTO cache.cached (query_id, version, query, created, access_count, last_accessed, compute_time, @@ -297,6 +298,7 @@ def write_cache_metadata( psycopg2.Binary(self_storage), ), ) + logger.debug("Touching cache.", query_id=query.query_id, query=str(query)) connection.exec_driver_sql( "SELECT touch_cache(%(ident)s);", dict(ident=query.query_id) ) @@ -310,39 +312,6 @@ def write_cache_metadata( logger.debug(f"{query.fully_qualified_table_name} added to cache.") else: logger.debug(f"Touched cache for {query.fully_qualified_table_name}.") -======= - with con.begin(): - cache_record_insert = """ - INSERT INTO cache.cached - (query_id, version, query, created, access_count, last_accessed, compute_time, - cache_score_multiplier, class, schema, tablename, obj) - VALUES (%s, %s, %s, NOW(), 0, NOW(), %s, 0, %s, %s, %s, %s) - ON CONFLICT (query_id) DO UPDATE SET last_accessed = NOW();""" - con.execute( - cache_record_insert, - ( - query.query_id, - __version__, - query._make_query(), - compute_time, - query.__class__.__name__, - *query.fully_qualified_table_name.split("."), - psycopg2.Binary(self_storage), - ), - ) - logger.debug("Touching cache.", query_id=query.query_id, query=str(query)) - con.execute("SELECT touch_cache(%s);", query.query_id) - - if not in_cache: - for dep in query._get_stored_dependencies(exclude_self=True): - con.execute( - "INSERT INTO cache.dependencies values (%s, %s) ON CONFLICT DO NOTHING", - (query.query_id, dep.query_id), - ) - logger.debug(f"{query.fully_qualified_table_name} added to cache.") - else: - logger.debug(f"Touched cache for {query.fully_qualified_table_name}.") ->>>>>>> 5b45f1582 (Match cache half life) except NotImplementedError: logger.debug("Table has no standard name.") @@ -363,17 +332,13 @@ def touch_cache(connection: "Connection", query_id: str) -> float: The new cache score """ try: -<<<<<<< HEAD + logger.debug("Touching cache.", query_id=query_id) with connection.engine.begin() as trans: return float( trans.exec_driver_sql(f"SELECT touch_cache('{query_id}')").fetchall()[ 0 ][0] ) -======= - logger.debug("Touching cache.", query_id=query_id) - return float(connection.fetch(f"SELECT touch_cache('{query_id}')")[0][0]) ->>>>>>> 5b45f1582 (Match cache half life) except (IndexError, psycopg2.InternalError): raise ValueError(f"Query id '{query_id}' is not in cache on this 
connection.") @@ -515,6 +480,18 @@ def get_query_object_by_id(connection: "Connection", query_id: str) -> "Query": raise ValueError(f"Query id '{query_id}' is not in cache on this connection.") +def _get_protected_classes(): + from flowmachine.core.events_table import events_table_map + from flowmachine.core.infrastructure_table import infrastructure_table_map + + return [ + "Table", + "GeoTable", + *[cls.__name__ for cls in events_table_map.values()], + *[cls.__name__ for cls in infrastructure_table_map.values()], + ] + + def get_cached_query_objects_ordered_by_score( connection: "Connection", protected_period: Optional[int] = None, @@ -542,11 +519,9 @@ def get_cached_query_objects_ordered_by_score( if protected_period is not None else " AND NOW()-created > (cache_protected_period()*INTERVAL '1 seconds')" ) - qry = f""" - WITH no_score AS (SELECT array_agg(object_class) as classes FROM cache.zero_cache) - SELECT query_id, table_size(tablename, schema) as table_size + qry = f"""SELECT query_id, table_size(tablename, schema) as table_size FROM cache.cached - WHERE NOT (cached.class=ANY((SELECT classes FROM no_score)::TEXT[])) + WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()})) {protected_period_clause} ORDER BY cache_score(cache_score_multiplier, compute_time, table_size(tablename, schema)) ASC """ @@ -726,11 +701,9 @@ def get_size_of_cache(connection: "Connection") -> int: Number of bytes in total used by cache tables """ - sql = f""" - WITH no_score AS (SELECT array_agg(object_class) as classes FROM cache.zero_cache) - SELECT sum(table_size(tablename, schema)) as total_bytes + sql = f"""SELECT sum(table_size(tablename, schema)) as total_bytes FROM cache.cached - WHERE NOT (cached.class=ANY((SELECT classes FROM no_score)::TEXT[]))""" + WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()}))""" cache_bytes = connection.fetch(sql)[0][0] return 0 if cache_bytes is None else int(cache_bytes) From 6888c06f7186245f86e7babf6aeddfea19625744 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Mon, 27 Jun 2022 13:49:40 +0100 Subject: [PATCH 64/89] Update test_query_object_construction.test_construct_query.approved.txt --- ..._query_object_construction.test_construct_query.approved.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt index 33e352a27a..81affbe167 100644 --- a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt +++ b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt @@ -32,7 +32,7 @@ "sampling": null } }, - "7bc462543d358dedb09a3e2c822b07d2": { + "f7137ccf1ed6f7ce5d8626f00c29668a": { "query_kind": "location_event_counts", "start_date": "2016-01-01", "end_date": "2016-01-02", From 0b5cf0c1c7a396221ff227bffd6e1a748757115e Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 10:26:25 +0100 Subject: [PATCH 65/89] Make FlowDBTable abstract --- flowmachine/flowmachine/core/flowdb_table.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/flowdb_table.py b/flowmachine/flowmachine/core/flowdb_table.py index fded845ab3..8d88b3cf12 100644 --- a/flowmachine/flowmachine/core/flowdb_table.py +++ b/flowmachine/flowmachine/core/flowdb_table.py @@ -1,7 +1,10 @@ +from abc import ABCMeta + from flowmachine.core.table import Table -class FlowDBTable(Table): + +class 
FlowDBTable(Table, metaclass=ABCMeta): def __init__(self, *, name, schema, columns): if columns is None: columns = self.all_columns @@ -11,3 +14,7 @@ def __init__(self, *, name, schema, columns): raise ValueError( f"Columns {columns} must be a subset of {self.all_columns}" ) + + @property + def all_columns(self): + raise NotImplementedError From db6c40433b8ffc2b151391a56f7e4b2e64d5a671 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 10:28:31 +0100 Subject: [PATCH 66/89] Cache protected classes --- flowmachine/flowmachine/core/cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/cache.py b/flowmachine/flowmachine/core/cache.py index a7a39b3e4a..5fa80b60f5 100644 --- a/flowmachine/flowmachine/core/cache.py +++ b/flowmachine/flowmachine/core/cache.py @@ -12,7 +12,7 @@ import sqlalchemy.engine from contextvars import copy_context from concurrent.futures import Executor, TimeoutError -from functools import partial +from functools import partial, lru_cache from sqlalchemy.exc import ResourceClosedError from typing import TYPE_CHECKING, Tuple, List, Callable, Optional @@ -487,6 +487,7 @@ def get_query_object_by_id(connection: "Connection", query_id: str) -> "Query": raise ValueError(f"Query id '{query_id}' is not in cache on this connection.") +@lru_cache(maxsize=1) def _get_protected_classes(): from flowmachine.core.events_table import events_table_map from flowmachine.core.infrastructure_table import infrastructure_table_map From 95bf78bcb3f6ae15117659b9ee815ffecf857522 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 10:29:54 +0100 Subject: [PATCH 67/89] Correct doctsring --- flowmachine/flowmachine/core/errors/flowmachine_errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/errors/flowmachine_errors.py b/flowmachine/flowmachine/core/errors/flowmachine_errors.py index ef23a34dbb..41c3605f1d 100644 --- a/flowmachine/flowmachine/core/errors/flowmachine_errors.py +++ b/flowmachine/flowmachine/core/errors/flowmachine_errors.py @@ -17,7 +17,7 @@ class PreFlightFailedException(Exception): ---------- query_id : str Identifier of the query - failures : dict + errors : dict Mapping from query reps to lists of exceptions raised in preflight """ From 8d1eb11d3da03ae28e4d636dec209389228b380d Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 10:33:51 +0100 Subject: [PATCH 68/89] Fix column name typos --- flowmachine/flowmachine/core/events_table.py | 2 +- flowmachine/flowmachine/core/infrastructure_table.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flowmachine/flowmachine/core/events_table.py b/flowmachine/flowmachine/core/events_table.py index bd712b8450..955175f1c4 100644 --- a/flowmachine/flowmachine/core/events_table.py +++ b/flowmachine/flowmachine/core/events_table.py @@ -103,7 +103,7 @@ class TopupsTable(EventsTable): "imsi", "imei", "tac", - "opera" "tor_code", + "operator_code", "country_code", ] diff --git a/flowmachine/flowmachine/core/infrastructure_table.py b/flowmachine/flowmachine/core/infrastructure_table.py index 5501655255..3ed587dbec 100644 --- a/flowmachine/flowmachine/core/infrastructure_table.py +++ b/flowmachine/flowmachine/core/infrastructure_table.py @@ -46,7 +46,7 @@ class CellsTable(InfrastructureTable): "min_range", "electrical_tilt", "mechanical_downtilt", - "date_of_f" "irst_service", + "date_of_first_service", "date_of_last_service", "geom_point", "geom_polygon", @@ -73,7 +73,7 @@ 
class TacsTable(InfrastructureTable): "mms_built_in_camera", "wap_push_ota_support", "hardware_gprs", - "hardw" "are_edge", + "hardware_edge", "hardware_umts", "hardware_wifi", "hardware_bluetooth", @@ -83,7 +83,7 @@ class TacsTable(InfrastructureTable): "software_os_version", "wap_push_ota_settings", "wap_push_ota_bookmarks", - "wap_push_ota" "_app_internet", + "wap_push_ota_app_internet", "wap_push_ota_app_browser", "wap_push_ota_app_mms", "wap_push_ota_single_shot", @@ -91,7 +91,7 @@ class TacsTable(InfrastructureTable): "wap_push_oma_settings", "wap_push_oma_app_internet", "wap_push_oma_app_browser", - "wap_" "push_oma_app_mms", + "wap_push_oma_app_mms", "wap_push_oma_cp_bookmarks", "wap_1_2_1", "wap_2_0", From 0ce6a5588c55920ac20d9d3bb5e5ee9faac205b5 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 10:34:27 +0100 Subject: [PATCH 69/89] Remove unused imports in preflight --- flowmachine/flowmachine/core/preflight.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/flowmachine/flowmachine/core/preflight.py b/flowmachine/flowmachine/core/preflight.py index 125a0c86c0..7bef250790 100644 --- a/flowmachine/flowmachine/core/preflight.py +++ b/flowmachine/flowmachine/core/preflight.py @@ -1,6 +1,5 @@ import inspect from collections import defaultdict -from functools import wraps import typing @@ -8,12 +7,10 @@ import structlog from flowmachine.core.dependency_graph import ( - calculate_dependency_graph, get_dependency_links, _assemble_dependency_graph, ) from flowmachine.core.errors.flowmachine_errors import ( - QueryErroredException, PreFlightFailedException, ) From dbbca00c72a5585a0d7131711e4072b7a1d5580f Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 11:31:18 +0100 Subject: [PATCH 70/89] Check for empty string as a table name --- .../flowmachine/features/utilities/events_tables_union.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index b8ccd39e22..972781dadb 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -104,12 +104,12 @@ def _parse_tables(tables): "calls", "sms", ) # This should default to all the tables really, but that would break all the tests + if tables == "": + raise ValueError("Empty table name.") elif isinstance(tables, str): return [tables] - elif isinstance(tables, str): - raise ValueError("Empty table name.") elif not isinstance(tables, list) or not all( - [isinstance(tbl, str) for tbl in tables] + [isinstance(tbl, str) for tbl in tables] ): raise ValueError("Tables must be a string or list of strings.") elif len(tables) == 0: From a6bfdd8a73d872c1830ca4a06a3d656d52362135 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 11:31:57 +0100 Subject: [PATCH 71/89] Dedupe tables --- .../flowmachine/features/utilities/events_tables_union.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/features/utilities/events_tables_union.py b/flowmachine/flowmachine/features/utilities/events_tables_union.py index 972781dadb..cf51c7cd13 100644 --- a/flowmachine/flowmachine/features/utilities/events_tables_union.py +++ b/flowmachine/flowmachine/features/utilities/events_tables_union.py @@ -115,7 +115,7 @@ def _parse_tables(tables): elif len(tables) == 0: raise ValueError("Empty tables list.") else: - return sorted(tables) + return 
sorted(set(tables)) def _make_table_list( From aece3d918ebd24bcb31e2470dc28985841726941 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 11:43:05 +0100 Subject: [PATCH 72/89] Simplify hooks iteration Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- flowmachine/flowmachine/core/preflight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/preflight.py b/flowmachine/flowmachine/core/preflight.py index 7bef250790..94b2168f00 100644 --- a/flowmachine/flowmachine/core/preflight.py +++ b/flowmachine/flowmachine/core/preflight.py @@ -54,7 +54,7 @@ def resolve_hooks(cls) -> typing.Dict[str, typing.List[typing.Callable]]: except AttributeError: pass else: - for key in hook_config.keys(): + for key in hook_config: # Use name here so we can get the bound method later, in # case the processor was a descriptor or something. hooks[key].append(attr_name) From f31793761dc1e0bcd1532e3611cacb9b325d3e60 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 11:57:41 +0100 Subject: [PATCH 73/89] Remove a couple of unused imports Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- flowmachine/flowmachine/core/query.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index d30609f6f9..12617f3c6c 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -31,9 +31,16 @@ ) from flowmachine.core.errors.flowmachine_errors import ( QueryResetFailedException, - QueryErroredException, + QueryResetFailedException, +``` + +Note: The suggestion shows only the relevant line since that's what was provided in the original code snippet. In the actual file, this would be part of a larger import statement: +```python +from flowmachine.core.errors.flowmachine_errors import ( + QueryResetFailedException, +) ) -from flowmachine.core.preflight import resolve_hooks, Preflight +from flowmachine.core.preflight import Preflight from flowmachine.core.query_state import QueryStateMachine from abc import ABCMeta, abstractmethod From 0de3df1dca5b9892084e9782235113c4f1c16462 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 12:03:25 +0100 Subject: [PATCH 74/89] Fix dodgy suggestion --- flowmachine/flowmachine/core/query.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index 12617f3c6c..e9275fafc6 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -29,17 +29,10 @@ get_redis, submit_to_executor, ) -from flowmachine.core.errors.flowmachine_errors import ( - QueryResetFailedException, - QueryResetFailedException, -``` -Note: The suggestion shows only the relevant line since that's what was provided in the original code snippet. 
In the actual file, this would be part of a larger import statement: -```python from flowmachine.core.errors.flowmachine_errors import ( QueryResetFailedException, ) -) from flowmachine.core.preflight import Preflight from flowmachine.core.query_state import QueryStateMachine from abc import ABCMeta, abstractmethod From 137d88668ad8a8770b60897a5bcf1455880d0598 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 12:04:04 +0100 Subject: [PATCH 75/89] Lint --- flowmachine/flowmachine/core/flowdb_table.py | 1 - .../core/server/query_schemas/base_schema.py | 10 +++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/flowmachine/flowmachine/core/flowdb_table.py b/flowmachine/flowmachine/core/flowdb_table.py index 8d88b3cf12..a6d4d510d5 100644 --- a/flowmachine/flowmachine/core/flowdb_table.py +++ b/flowmachine/flowmachine/core/flowdb_table.py @@ -3,7 +3,6 @@ from flowmachine.core.table import Table - class FlowDBTable(Table, metaclass=ABCMeta): def __init__(self, *, name, schema, columns): if columns is None: diff --git a/flowmachine/flowmachine/core/server/query_schemas/base_schema.py b/flowmachine/flowmachine/core/server/query_schemas/base_schema.py index 0ee6df244e..82f57837b4 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/base_schema.py +++ b/flowmachine/flowmachine/core/server/query_schemas/base_schema.py @@ -22,9 +22,13 @@ def remove_query_kind_if_present_and_load(self, params, **kwargs): elif "lon-lat" in aggregation_unit_string: spatial_unit_args = { "spatial_unit_type": "lon-lat", - "geom_table": None - if geom_table is None - else Table(geom_table, columns=[geom_table_join_on, "geom_point"]), + "geom_table": ( + None + if geom_table is None + else Table( + geom_table, columns=[geom_table_join_on, "geom_point"] + ) + ), "geom_table_join_on": geom_table_join_on, } else: From ddd660dedb6866b3a9128831de5e153605526cb9 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 12:24:43 +0100 Subject: [PATCH 76/89] Call preflight on the query object --- flowmachine/flowmachine/core/server/action_handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/server/action_handlers.py b/flowmachine/flowmachine/core/server/action_handlers.py index 65dce6c082..118ee121a9 100644 --- a/flowmachine/flowmachine/core/server/action_handlers.py +++ b/flowmachine/flowmachine/core/server/action_handlers.py @@ -85,7 +85,7 @@ async def action_handler__get_query_schemas( def _load_query_object(params: dict) -> "BaseExposedQuery": try: query_obj = FlowmachineQuerySchema().load(params) - query_obj.preflight() # Note that we probably want to remove this call to allow getting qid faster + query_obj._flowmachine_query_obj.preflight() # Note that we probably want to remove this call to allow getting qid faster except PreFlightFailedException as exc: orig_error_msg = exc.args[0] error_msg = ( From a57e84335c1965f9355fcb4f3c96b2a4090f2348 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 12:24:56 +0100 Subject: [PATCH 77/89] Update approval tests --- ...lts.test_daily_location_1_sql.approved.txt | 19 +---- ...lts.test_daily_location_2_sql.approved.txt | 17 +--- ...lts.test_daily_location_4_sql.approved.txt | 84 +++++++++---------- ...lts.test_daily_location_6_sql.approved.txt | 19 +---- ...truction.test_construct_query.approved.txt | 18 ++-- 5 files changed, 52 insertions(+), 105 deletions(-) diff --git 
a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_1_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_1_sql.approved.txt index 3747f837cf..d6f2f54199 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_1_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_1_sql.approved.txt @@ -27,25 +27,8 @@ FROM (SELECT subscriber, loc_table.date_of_last_service, geom_table.admin3pcod AS pcod FROM infrastructure.cells AS loc_table - INNER JOIN (SELECT gid, - admin0name, - admin0pcod, - admin1name, - admin1pcod, - admin2name, - admin2pcod, + INNER JOIN (SELECT admin3pcod, admin3name, - admin3pcod, - admin3refn, - admin3altn, - admin3al_1, - date, - validon, - validto, - shape_star, - shape_stle, - shape_leng, - shape_area, geom FROM geography.admin3) AS geom_table ON st_within(CAST(loc_table.geom_point AS geometry), CAST(st_setsrid(geom_table.geom, 4326) AS geometry))) AS sites ON l.location_id = sites.location_id diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_2_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_2_sql.approved.txt index 9b2e96744b..c735dbb079 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_2_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_2_sql.approved.txt @@ -42,23 +42,8 @@ FROM (SELECT times_visited.subscriber, loc_table.date_of_last_service, geom_table.admin2pcod AS pcod FROM infrastructure.cells AS loc_table - INNER JOIN (SELECT gid, - admin0name, - admin0pcod, - admin1name, - admin1pcod, + INNER JOIN (SELECT admin2pcod, admin2name, - admin2pcod, - admin2refn, - admin2altn, - admin2al_1, - date, - validon, - validto, - shape_star, - shape_stle, - shape_leng, - shape_area, geom FROM geography.admin2) AS geom_table ON st_within(CAST(loc_table.geom_point AS geometry), CAST(st_setsrid(geom_table.geom, 4326) AS geometry))) AS sites ON l.location_id = sites.location_id diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt index 625e1035cd..291ab04249 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt @@ -1,44 +1,40 @@ -SELECT final_time.subscriber, - pcod -FROM (SELECT subscriber_locs.subscriber, - time, - pcod, - row_number() OVER (PARTITION BY subscriber_locs.subscriber - ORDER BY time DESC) AS rank - FROM (SELECT subscriber, - datetime AS time, - pcod - FROM (SELECT l.datetime, - l.location_id, - l.subscriber, - sites.pcod - FROM (SELECT tbl.datetime, - tbl.location_id, - tbl.subscriber - FROM (SELECT events.calls.datetime AS datetime, - events.calls.location_id AS location_id, - events.calls.msisdn AS subscriber - FROM events.calls - WHERE events.calls.datetime >= '2016-01-05 00:00:00' - AND events.calls.datetime < '2016-01-06 00:00:00' - AND (to_char(events.calls.datetime, 
'HH24:MI') < '06:00' - OR to_char(events.calls.datetime, 'HH24:MI') >= '22:00')) AS tbl - INNER JOIN (SELECT * - FROM ((VALUES ('dr9xNYK006wykgXj'))) AS tmp (subscriber)) AS subset_query ON tbl.subscriber = subset_query.subscriber) AS l - INNER JOIN (SELECT loc_table.id AS location_id, - loc_table.date_of_first_service, - loc_table.date_of_last_service, - geom_table.admin3pcod AS pcod - FROM infrastructure.cells AS loc_table - INNER JOIN (SELECT admin3pcod, - admin3name, - geom - FROM geography.admin3) AS geom_table ON st_within(CAST(loc_table.geom_point AS geometry), - CAST(st_setsrid(geom_table.geom, 4326) AS geometry))) AS sites ON l.location_id = sites.location_id - AND CAST(l.datetime AS date) BETWEEN COALESCE(sites.date_of_first_service, - CAST('-infinity' AS timestamptz)) - AND COALESCE(sites.date_of_last_service, - CAST('infinity' AS timestamptz))) AS foo - WHERE location_id IS NOT NULL - AND location_id <> '') AS subscriber_locs) AS final_time -WHERE rank = 1 +SELECT DISTINCT ON (subscriber_locs.subscriber) subscriber_locs.subscriber, + pcod +FROM (SELECT subscriber, + datetime AS time, + pcod + FROM (SELECT l.datetime, + l.location_id, + l.subscriber, + sites.pcod + FROM (SELECT tbl.datetime, + tbl.location_id, + tbl.subscriber + FROM (SELECT events.calls.datetime AS datetime, + events.calls.location_id AS location_id, + events.calls.msisdn AS subscriber + FROM events.calls + WHERE events.calls.datetime >= '2016-01-05 00:00:00' + AND events.calls.datetime < '2016-01-06 00:00:00' + AND (to_char(events.calls.datetime, 'HH24:MI') < '06:00' + OR to_char(events.calls.datetime, 'HH24:MI') >= '22:00')) AS tbl + INNER JOIN (SELECT * + FROM ((VALUES ('dr9xNYK006wykgXj'))) AS tmp (subscriber)) AS subset_query ON tbl.subscriber = subset_query.subscriber) AS l + INNER JOIN (SELECT loc_table.id AS location_id, + loc_table.date_of_first_service, + loc_table.date_of_last_service, + geom_table.admin3pcod AS pcod + FROM infrastructure.cells AS loc_table + INNER JOIN (SELECT admin3pcod, + admin3name, + geom + FROM geography.admin3) AS geom_table ON st_within(CAST(loc_table.geom_point AS geometry), + CAST(st_setsrid(geom_table.geom, 4326) AS geometry))) AS sites ON l.location_id = sites.location_id + AND CAST(l.datetime AS date) BETWEEN COALESCE(sites.date_of_first_service, + CAST('-infinity' AS timestamptz)) + AND COALESCE(sites.date_of_last_service, + CAST('infinity' AS timestamptz))) AS foo + WHERE location_id IS NOT NULL + AND location_id <> '') AS subscriber_locs +ORDER BY subscriber_locs.subscriber, + time DESC diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt index 4d7fb8cb73..8d7f2008ce 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt @@ -28,25 +28,8 @@ FROM (SELECT subscriber, loc_table.date_of_last_service, geom_table.admin3pcod AS pcod FROM infrastructure.cells AS loc_table - INNER JOIN (SELECT gid, - admin0name, - admin0pcod, - admin1name, - admin1pcod, - admin2name, - admin2pcod, + INNER JOIN (SELECT admin3pcod, admin3name, - admin3pcod, - admin3refn, - admin3altn, - admin3al_1, - date, - validon, - validto, - shape_star, - shape_stle, - shape_leng, - shape_area, geom FROM 
geography.admin3) AS geom_table ON st_within(CAST(loc_table.geom_point AS geometry), CAST(st_setsrid(geom_table.geom, 4326) AS geometry))) AS sites ON l.location_id = sites.location_id diff --git a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt index d003b6b3dd..44848428d5 100644 --- a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt +++ b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt @@ -32,7 +32,7 @@ "sampling": null } }, - "f7137ccf1ed6f7ce5d8626f00c29668a": { + "0b3536e28692e84e84ba98c64df38280": { "query_kind": "location_event_counts", "start_date": "2016-01-01", "end_date": "2016-01-02", @@ -42,7 +42,7 @@ "event_types": null, "subscriber_subset": null }, - "1944671c4caa2e8adbfb2241df113b98": { + "2b9d575accc718b8fba71457ea8acabf": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "modal_location", @@ -68,7 +68,7 @@ "query_kind": "geography", "aggregation_unit": "admin3" }, - "e6edd65d1884739353fc8a8b8bb2982c": { + "05f6769184c4ad4571ec111b1a7b3029": { "query_kind": "meaningful_locations_aggregate", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -161,7 +161,7 @@ "tower_cluster_call_threshold": 0, "subscriber_subset": null }, - "0f62ecc4078c84a11e6fd8fd473077a8": { + "d7192263d6ac8f3826ad3d9edd92a255": { "query_kind": "meaningful_locations_between_label_od_matrix", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -256,7 +256,7 @@ "event_types": null, "subscriber_subset": null }, - "8513589b559f4b7ae5e4a6a7a283c44d": { + "269b9d1f4ec72727ff2f9ced0027d356": { "query_kind": "meaningful_locations_between_dates_od_matrix", "aggregation_unit": "admin1", "start_date_a": "2016-01-01", @@ -355,7 +355,7 @@ ], "subscriber_subset": null }, - "c3dc95da89aeb0908b9398918dab6f26": { + "6f7a3eb1b622ae11f4fe8b72ed567068": { "query_kind": "flows", "from_location": { "query_kind": "daily_location", @@ -371,7 +371,7 @@ }, "join_type": "left outer" }, - "95fbc18554e15733df47bfd5cbaa3f87": { + "3b0cfae8f25961e166d5659130caa2fb": { "query_kind": "flows", "from_location": { "query_kind": "majority_location", @@ -418,7 +418,7 @@ } } }, - "920dfdf5568d75921c4173da8bccc6ef": { + "4e93b642dc8656bc73e116d17f362ce9": { "query_kind": "labelled_spatial_aggregate", "locations": { "query_kind": "coalesced_location", @@ -640,7 +640,7 @@ "stay_length_threshold": 2 } }, - "c915d87b66df83904634990e2f78da9e": { + "a18bd5d93c83c7da665815a8b255f36c": { "query_kind": "labelled_flows", "from_location": { "query_kind": "coalesced_location", From 8a09e6c019d50fb0b6527f85e8924856184ce447 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 12:25:17 +0100 Subject: [PATCH 78/89] Need to actually depend on the nested query to force it to store first --- flowmachine/tests/test_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/tests/test_cache.py b/flowmachine/tests/test_cache.py index 9222daf930..b49e499cd4 100644 --- a/flowmachine/tests/test_cache.py +++ b/flowmachine/tests/test_cache.py @@ -313,7 +313,7 @@ def column_names(self): return ["value"] def _make_query(self): - return "select 1 as value" + return self.nested.get_query() q = NestTestQuery() q_id = q.query_id From 37de99e8f5bff1d6fcdd817316d573d9531177be Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 12:25:26 +0100 Subject: [PATCH 79/89] Update table param --- 
flowmachine/tests/test_redacted_total_events.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flowmachine/tests/test_redacted_total_events.py b/flowmachine/tests/test_redacted_total_events.py index 45638cd7d6..aaf2a0b189 100644 --- a/flowmachine/tests/test_redacted_total_events.py +++ b/flowmachine/tests/test_redacted_total_events.py @@ -49,7 +49,7 @@ def test_all_above_threshold_hour_bucket(get_dataframe): "2016-01-02", spatial_unit=make_spatial_unit("cell"), interval="hour", - table=["events.calls"], + table=["calls"], ) ) @@ -67,7 +67,7 @@ def test_all_above_threshold_minute_bucket(get_dataframe): "2016-01-01 13:00", spatial_unit=make_spatial_unit("cell"), interval="min", - table=["events.calls"], + table=["calls"], ) ) From 1d7730823dadb9d25ce57a10b86a9a4011361ab3 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 12:27:36 +0100 Subject: [PATCH 80/89] Add missing columns arguments --- .../tests/test_union_with_fixed_values.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/flowmachine/tests/test_union_with_fixed_values.py b/flowmachine/tests/test_union_with_fixed_values.py index f5bdc1c3e7..0d651e3b99 100644 --- a/flowmachine/tests/test_union_with_fixed_values.py +++ b/flowmachine/tests/test_union_with_fixed_values.py @@ -13,19 +13,25 @@ def test_union_column_names(): """Test that Union's column_names property is accurate""" union = UnionWithFixedValues( - [Table("events.calls_20160101"), Table("events.calls_20160102")], + [ + Table("events.calls_20160101", columns=["msisdn"]), + Table("events.calls_20160102", columns=["msisdn"]), + ], ["extra_val", "extra_val"], fixed_value_column_name="extra_col", ) assert union.head(0).columns.tolist() == union.column_names - assert union.column_names == [*Table("events.calls_20160101").columns, "extra_col"] + assert union.column_names == [ + *Table("events.calls_20160101", columns=["msisdn"]).columns, + "extra_col", + ] def test_union_all(get_dataframe): """ Test default union behaviour keeps duplicates. """ - q1 = Table(schema="events", name="calls") + q1 = Table(schema="events", name="calls", columns=["id"]) union_all = q1.union(q1) union_all_df = get_dataframe(union_all) single_id = union_all_df[union_all_df.id == "5wNJA-PdRJ4-jxEdG-yOXpZ"] @@ -37,7 +43,10 @@ def test_union(get_dataframe): Test union adds extra columns. """ union = UnionWithFixedValues( - [Table("events.calls_20160101"), Table("events.calls_20160101")], + [ + Table("events.calls_20160101", columns=["msisdn"]), + Table("events.calls_20160101", columns=["msisdn"]), + ], ["extra_val", "extra_val_1"], fixed_value_column_name="extra_col", ) @@ -51,7 +60,10 @@ def test_union_date_type(get_dataframe): Test union casts types correctly for datetimes. 
""" union = UnionWithFixedValues( - [Table("events.calls_20160101"), Table("events.calls_20160101")], + [ + Table("events.calls_20160101", columns=["msisdn"]), + Table("events.calls_20160101", columns=["msisdn"]), + ], [datetime.datetime(2016, 1, 1), datetime.datetime(2016, 1, 2)], fixed_value_column_name="extra_col", ) From 52c0bdcb19a4ef912740911339cf860a363a3e98 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 13:14:00 +0100 Subject: [PATCH 81/89] Pass up preflight errors, fix error test The order that the preflight errors are in is nondeterministic for dependencies of the same level in the tree --- flowmachine/flowmachine/core/server/action_handlers.py | 9 +-------- flowmachine/tests/server/test_action_handlers.py | 4 +++- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/flowmachine/flowmachine/core/server/action_handlers.py b/flowmachine/flowmachine/core/server/action_handlers.py index 118ee121a9..116416eea6 100644 --- a/flowmachine/flowmachine/core/server/action_handlers.py +++ b/flowmachine/flowmachine/core/server/action_handlers.py @@ -93,9 +93,7 @@ def _load_query_object(params: dict) -> "BaseExposedQuery": f"The original error was: '{orig_error_msg}'" ) raise QueryLoadError( - error_msg, - params, - orig_error_msg=orig_error_msg, + error_msg, params, orig_error_msg=orig_error_msg, errors=exc.errors ) except TypeError as exc: # We need to catch TypeError here, otherwise they propagate up to @@ -110,11 +108,6 @@ def _load_query_object(params: dict) -> "BaseExposedQuery": params, orig_error_msg=orig_error_msg, ) - raise QueryLoadError( - error_msg, - params, - orig_error_msg=orig_error_msg, - ) except ValidationError as exc: # The dictionary of marshmallow errors can contain integers as keys, # which will raise an error when converting to JSON (where the keys diff --git a/flowmachine/tests/server/test_action_handlers.py b/flowmachine/tests/server/test_action_handlers.py index 51d3a6b249..bf224b5b2e 100644 --- a/flowmachine/tests/server/test_action_handlers.py +++ b/flowmachine/tests/server/test_action_handlers.py @@ -187,7 +187,9 @@ async def test_run_query_error_handled(dummy_redis, server_config): ), ) assert msg.status == ZMQReplyStatus.ERROR - assert msg.msg.rstrip() == "Preflight failed for 7b71413efc91213e798ca3bd53107186." + assert msg.msg.rstrip().startswith( + "Internal flowmachine server error: could not create query object using query schema. The original error was: 'Pre-flight failed for '7b71413efc91213e798ca3bd53107186'. 
Errors:" + ) assert len(msg.payload["errors"]) == 3 From 302abc7e2dd204f54ca92983e9dac2a19b96cd27 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 13:24:29 +0100 Subject: [PATCH 82/89] Fix typo in example Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- flowmachine/flowmachine/core/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/table.py b/flowmachine/flowmachine/core/table.py index 9be6dd78f7..db03cf3968 100644 --- a/flowmachine/flowmachine/core/table.py +++ b/flowmachine/flowmachine/core/table.py @@ -39,7 +39,7 @@ class Table(Query): Examples -------- - >>> t = Table(name="calls", schema="events", olumns=["id", "outgoing", "datetime", "duration"]) + >>> t = Table(name="calls", schema="events", columns=["id", "outgoing", "datetime", "duration"]) >>> t.head() id outgoing datetime duration \ 0 5wNJA-PdRJ4-jxEdG-yOXpZ True 2016-01-01 22:38:06+00:00 3393.0 From e1510e07f6575dece852cf4c7a7bdb4b36bf6027 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 13:47:20 +0100 Subject: [PATCH 83/89] Log at error level for exception --- flowmachine/flowmachine/core/server/action_handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/server/action_handlers.py b/flowmachine/flowmachine/core/server/action_handlers.py index 116416eea6..07ba567efb 100644 --- a/flowmachine/flowmachine/core/server/action_handlers.py +++ b/flowmachine/flowmachine/core/server/action_handlers.py @@ -226,7 +226,7 @@ async def action_handler__run_query( }, ) except Exception as exc: - logger.debug(str(exc), exception=exc, traceback=traceback.format_exc()) + logger.error(str(exc), exception=exc, traceback=traceback.format_exc()) return ZMQReply( status="error", msg="Unable to create query object.", From 267fac58ec6931ac2e16525613e746b9a043e0c5 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 13:50:07 +0100 Subject: [PATCH 84/89] Add missing mpl headers --- flowmachine/flowmachine/core/events_table.py | 4 ++++ flowmachine/flowmachine/core/flowdb_table.py | 17 ++++++++++++++++- .../flowmachine/core/infrastructure_table.py | 4 ++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/events_table.py b/flowmachine/flowmachine/core/events_table.py index 955175f1c4..c51e5d684a 100644 --- a/flowmachine/flowmachine/core/events_table.py +++ b/flowmachine/flowmachine/core/events_table.py @@ -1,3 +1,7 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + from flowmachine.core.flowdb_table import FlowDBTable diff --git a/flowmachine/flowmachine/core/flowdb_table.py b/flowmachine/flowmachine/core/flowdb_table.py index a6d4d510d5..e98cdb5767 100644 --- a/flowmachine/flowmachine/core/flowdb_table.py +++ b/flowmachine/flowmachine/core/flowdb_table.py @@ -1,10 +1,25 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + from abc import ABCMeta +from typing import Optional from flowmachine.core.table import Table class FlowDBTable(Table, metaclass=ABCMeta): - def __init__(self, *, name, schema, columns): + """ + Abstract base class for fixed tables that exist in FlowDB. 
+ + Parameters + ---------- + name : str + schema : str + columns : list of str + """ + + def __init__(self, *, name: str, schema:str , columns:Optional[list[str]]) -> None: if columns is None: columns = self.all_columns if set(columns).issubset(self.all_columns): diff --git a/flowmachine/flowmachine/core/infrastructure_table.py b/flowmachine/flowmachine/core/infrastructure_table.py index 3ed587dbec..31eeb8a56f 100644 --- a/flowmachine/flowmachine/core/infrastructure_table.py +++ b/flowmachine/flowmachine/core/infrastructure_table.py @@ -1,3 +1,7 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + from flowmachine.core.flowdb_table import FlowDBTable From fea6c74edf6cf0c5b5c24ad09b186791c6492067 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 13:57:21 +0100 Subject: [PATCH 85/89] Delete flowmachine/tests/test_model.py --- flowmachine/tests/test_model.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 flowmachine/tests/test_model.py diff --git a/flowmachine/tests/test_model.py b/flowmachine/tests/test_model.py deleted file mode 100644 index e69de29bb2..0000000000 From 489c94eb713e60b560db2a39380d40a756980aa7 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 13:58:36 +0100 Subject: [PATCH 86/89] Correct test name --- flowmachine/tests/test_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/tests/test_table.py b/flowmachine/tests/test_table.py index d905e0eb19..f2e7a865cc 100644 --- a/flowmachine/tests/test_table.py +++ b/flowmachine/tests/test_table.py @@ -45,7 +45,7 @@ def test_table_preflight(args): Table(**args).preflight() -def public_schema_checked(): +def test_public_schema_checked(): """Test that where no schema is provided, public schema is checked.""" t = Table("gambia_admin2", columns=["geom"]).preflight() From 22fd099d8a7db7cec0373a19580e912998a6cb94 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 23 Oct 2024 14:02:59 +0100 Subject: [PATCH 87/89] Add missing columns arg --- flowmachine/tests/test_cache_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/tests/test_cache_utils.py b/flowmachine/tests/test_cache_utils.py index 5f4c771407..6596a26ac7 100644 --- a/flowmachine/tests/test_cache_utils.py +++ b/flowmachine/tests/test_cache_utils.py @@ -113,7 +113,7 @@ def test_touch_cache_record_for_table(flowmachine_connect): """ Touching a cache record for a table should update access count and last accessed but not touch score, or counter. 
""" - table = Table("events.calls_20160101") + table = Table("events.calls_20160101", columns=["id"]) table.preflight() with get_db().engine.begin() as conn: conn.exec_driver_sql( From 9a6073271406ce6802f12179b4ce70e53b26bbbf Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 24 Oct 2024 10:19:01 +0100 Subject: [PATCH 88/89] Update default schema test --- flowmachine/tests/test_table.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flowmachine/tests/test_table.py b/flowmachine/tests/test_table.py index f2e7a865cc..8f8396e7bc 100644 --- a/flowmachine/tests/test_table.py +++ b/flowmachine/tests/test_table.py @@ -46,8 +46,9 @@ def test_table_preflight(args): def test_public_schema_checked(): - """Test that where no schema is provided, public schema is checked.""" - t = Table("gambia_admin2", columns=["geom"]).preflight() + """Test that where no schema is provided, user schema is checked.""" + t = Table("gambia_admin2", columns=["geom"]) + assert "flowmachine" == t.schema def test_children(): From bb04fb8a77bb4a617bf0bb111bf0d8b71eedc2f8 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Thu, 24 Oct 2024 10:53:54 +0100 Subject: [PATCH 89/89] Types for tables --- flowmachine/flowmachine/core/events_table.py | 13 +++++++------ flowmachine/flowmachine/core/flowdb_table.py | 2 +- .../flowmachine/core/infrastructure_table.py | 9 +++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/flowmachine/flowmachine/core/events_table.py b/flowmachine/flowmachine/core/events_table.py index c51e5d684a..f1f5543dc8 100644 --- a/flowmachine/flowmachine/core/events_table.py +++ b/flowmachine/flowmachine/core/events_table.py @@ -1,12 +1,13 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+from typing import Optional from flowmachine.core.flowdb_table import FlowDBTable class EventsTable(FlowDBTable): - def __init__(self, *, name, columns): + def __init__(self, *, name, columns: Optional[list[str]] = None) -> None: super().__init__(schema="events", name=name, columns=columns) @@ -27,7 +28,7 @@ class CallsTable(EventsTable): "country_code", ] - def __init__(self, *, columns=None): + def __init__(self, *, columns: Optional[list[str]] = None) -> None: super().__init__(name="calls", columns=columns) @@ -47,7 +48,7 @@ class ForwardsTable(EventsTable): "country_code", ] - def __init__(self, *, columns=None): + def __init__(self, *, columns: Optional[list[str]] = None) -> None: super().__init__(name="forwards", columns=columns) @@ -67,7 +68,7 @@ class SmsTable(EventsTable): "country_code", ] - def __init__(self, *, columns): + def __init__(self, *, columns: Optional[list[str]] = None) -> None: super().__init__(name="sms", columns=columns) @@ -88,7 +89,7 @@ class MdsTable(EventsTable): "country_code", ] - def __init__(self, *, columns): + def __init__(self, *, columns: Optional[list[str]] = None) -> None: super().__init__(name="mds", columns=columns) @@ -111,7 +112,7 @@ class TopupsTable(EventsTable): "country_code", ] - def __init__(self, *, columns): + def __init__(self, *, columns: Optional[list[str]] = None) -> None: super().__init__(name="topups", columns=columns) diff --git a/flowmachine/flowmachine/core/flowdb_table.py b/flowmachine/flowmachine/core/flowdb_table.py index e98cdb5767..d8f3000335 100644 --- a/flowmachine/flowmachine/core/flowdb_table.py +++ b/flowmachine/flowmachine/core/flowdb_table.py @@ -19,7 +19,7 @@ class FlowDBTable(Table, metaclass=ABCMeta): columns : list of str """ - def __init__(self, *, name: str, schema:str , columns:Optional[list[str]]) -> None: + def __init__(self, *, name: str, schema: str, columns: Optional[list[str]]) -> None: if columns is None: columns = self.all_columns if set(columns).issubset(self.all_columns): diff --git a/flowmachine/flowmachine/core/infrastructure_table.py b/flowmachine/flowmachine/core/infrastructure_table.py index 31eeb8a56f..0f723710fe 100644 --- a/flowmachine/flowmachine/core/infrastructure_table.py +++ b/flowmachine/flowmachine/core/infrastructure_table.py @@ -1,12 +1,13 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +from typing import Optional from flowmachine.core.flowdb_table import FlowDBTable class InfrastructureTable(FlowDBTable): - def __init__(self, *, name, columns): + def __init__(self, *, name: str, columns: Optional[list[str]] = None) -> None: super().__init__(schema="infrastructure", name=name, columns=columns) @@ -26,7 +27,7 @@ class SitesTable(InfrastructureTable): "geom_polygon", ] - def __init__(self, *, columns=None): + def __init__(self, *, columns: Optional[list[str]] = None) -> None: super().__init__(name="sites", columns=columns) @@ -56,7 +57,7 @@ class CellsTable(InfrastructureTable): "geom_polygon", ] - def __init__(self, *, columns=None): + def __init__(self, *, columns: Optional[list[str]] = None) -> None: super().__init__(name="cells", columns=columns) @@ -117,7 +118,7 @@ class TacsTable(InfrastructureTable): "hnd_type", ] - def __init__(self, *, columns=None): + def __init__(self, *, columns: Optional[list[str]] = None) -> None: super().__init__(name="tacs", columns=columns)
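
Taken together, patches 84 and 89 move flowmachine towards typed, fixed-schema table classes: FlowDBTable resolves a missing columns argument to the subclass's all_columns and checks that any explicit selection is a subset of that schema. The sketch below illustrates that pattern only; ExampleCallsTable and its column list are hypothetical stand-ins rather than real FlowDB tables, the base class reproduces just the validation visible in the diff, and the ValueError is an assumed failure mode, since the patches do not show what the real class raises on an invalid selection.

    # Minimal sketch of the typed-table pattern in patches 84/89; names and the
    # error type are illustrative assumptions, not the flowmachine implementation.
    from abc import ABCMeta
    from typing import Optional


    class FlowDBTableSketch(metaclass=ABCMeta):
        """Stand-in for FlowDBTable: validates a column selection against a fixed schema."""

        all_columns: list[str]  # each subclass declares its fixed schema

        def __init__(self, *, name: str, schema: str, columns: Optional[list[str]]) -> None:
            if columns is None:
                # No selection given: default to every column, as FlowDBTable does
                columns = self.all_columns
            if not set(columns).issubset(self.all_columns):
                # Assumed failure mode; the real class's error type is not shown in the diff
                raise ValueError(
                    f"Columns {set(columns) - set(self.all_columns)} are not in {schema}.{name}"
                )
            self.name, self.schema, self.columns = name, schema, columns


    class ExampleCallsTable(FlowDBTableSketch):
        # Hypothetical column list, for illustration only
        all_columns = ["id", "msisdn", "datetime", "location_id"]

        def __init__(self, *, columns: Optional[list[str]] = None) -> None:
            super().__init__(schema="events", name="calls", columns=columns)


    ok = ExampleCallsTable(columns=["msisdn", "datetime"])  # subset of the schema: accepted
    try:
        ExampleCallsTable(columns=["not_a_column"])  # unknown column: rejected at construction
    except ValueError as exc:
        print(exc)

Failing at construction time keeps an invalid table specification from ever reaching preflight or storage, which fits the stricter validation the rest of this series introduces.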