From 4a06d0883258e8bd5e5f46481e1bce05b5f336de Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 24 Jul 2020 18:07:04 +0100 Subject: [PATCH 1/3] Framework of 'mobility indicators' worked example --- mobility-indicators.ipynb | 2268 +++++++++++++++++++++++++++++++++++++ 1 file changed, 2268 insertions(+) create mode 100644 mobility-indicators.ipynb diff --git a/mobility-indicators.ipynb b/mobility-indicators.ipynb new file mode 100644 index 0000000..3eee303 --- /dev/null +++ b/mobility-indicators.ipynb @@ -0,0 +1,2268 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mobility indicators\n", + "## Assessing the impact of mobility restrictions during a virus outbreak\n", + "\n", + "In this worked example...\n", + "\n", + "(Include link to COVID website)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Introduction\n", + "\n", + "- What we aim to do\n", + "- Summary of where we're heading, what prior knowledge is required, and what the end result will be" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run aggregates\n", + "\n", + "- Define queries\n", + "- Set aggregates running\n", + "- Get results" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "import flowclient as fc\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "from collections import Counter" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "token = " + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "conn = fc.connect(\n", + " url=\"https://api.flowcloud-ghana.flowminder.org\",\n", + " token=token,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [], + "source": [ + "# Define date range\n", + "start_date = 
\"2016-02-01\"\n", + "end_date = \"2016-06-01\"\n", + "all_dates = pd.date_range(start_date, end_date, closed=\"left\")" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "# Total subscribers per day\n", + "\n", + "# E.g. query for 1 day\n", + "total_subscribers_20160101_query = fc.unique_subscriber_counts(\n", + " connection=conn,\n", + " start_date=\"2016-01-01\",\n", + " end_date=\"2016-01-02\",\n", + " aggregation_unit=\"admin0\",\n", + ")\n", + "\n", + "# Queries for all dates\n", + "total_subscribers_per_day_queries = {}\n", + "\n", + "for day in all_dates:\n", + " total_subscribers_per_day_queries[day] = fc.unique_subscriber_counts(\n", + " connection=conn,\n", + " start_date=str(day),\n", + " end_date=str(day + pd.Timedelta(\"1 day\")),\n", + " aggregation_unit=\"admin0\",\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "# Set them running\n", + "for query in total_subscribers_per_day_queries.values():\n", + " query.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "# Subscribers per admin1 per day\n", + "subscribers_per_admin1_per_day_queries = {\n", + " day: fc.unique_subscriber_counts(\n", + " connection=conn,\n", + " start_date=str(day),\n", + " end_date=str(day + pd.Timedelta(\"1 day\")),\n", + " aggregation_unit=\"admin1\",\n", + " )\n", + " for day in all_dates\n", + "}\n", + "\n", + "for query in subscribers_per_admin1_per_day_queries.values():\n", + " query.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [], + "source": [ + "# Subscribers per admin2 per day\n", + "subscribers_per_admin2_per_day_queries = {\n", + " day: fc.unique_subscriber_counts(\n", + " connection=conn,\n", + " start_date=str(day),\n", + " end_date=str(day + pd.Timedelta(\"1 day\")),\n", + " aggregation_unit=\"admin2\",\n", + 
" )\n", + " for day in all_dates\n", + "}\n", + "\n", + "for query in subscribers_per_admin2_per_day_queries.values():\n", + " query.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "# OD matrix (admin2)\n", + "od_matrix_admin2_per_day_queries = {\n", + " day: fc.trips_od_matrix(\n", + " connection=conn,\n", + " start_date=str(day),\n", + " end_date=str(day + pd.Timedelta(\"1 day\")),\n", + " aggregation_unit=\"admin2\",\n", + " )\n", + " for day in all_dates\n", + "}\n", + "\n", + "for query in od_matrix_admin2_per_day_queries.values():\n", + " query.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "# Total events per day\n", + "\n", + "total_events_per_day_query = fc.location_event_counts(\n", + " connection=conn,\n", + " start_date=start_date,\n", + " end_date=end_date,\n", + " count_interval=\"day\",\n", + " aggregation_unit=\"admin0\",\n", + ")\n", + "\n", + "total_events_per_day_query.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "# Event counts (admin2 per day)\n", + "events_per_admin2_per_day_query = fc.location_event_counts(\n", + " connection=conn,\n", + " start_date=start_date,\n", + " end_date=end_date,\n", + " count_interval=\"day\",\n", + " aggregation_unit=\"admin2\",\n", + ")\n", + "\n", + "events_per_admin2_per_day_query.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total subscribers: Counter({'completed': 121})\n", + "Subscribers per admin1: Counter({'completed': 121})\n" + ] + }, + { + "ename": "FlowclientConnectionError", + "evalue": "Something went wrong: . 
API returned with status code: 500 and status 'errored'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFlowclientConnectionError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m ]:\n\u001b[1;32m 8\u001b[0m print(\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;34mf\"{label}: {Counter([query.status for query in query_group.values()])}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m ]:\n\u001b[1;32m 8\u001b[0m print(\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;34mf\"{label}: {Counter([query.status for query in query_group.values()])}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/share/virtualenvs/FlowKit-tutorials-Ghana-XGVjzS5J/lib/python3.8/site-packages/flowclient/api_query.py\u001b[0m in \u001b[0;36mstatus\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"_query_id\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"not_running\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 85\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mget_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mquery_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_query_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 86\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m def get_result(\n", + "\u001b[0;32m~/.local/share/virtualenvs/FlowKit-tutorials-Ghana-XGVjzS5J/lib/python3.8/site-packages/flowclient/client.py\u001b[0m in \u001b[0;36mget_status\u001b[0;34m(connection, query_id)\u001b[0m\n\u001b[1;32m 117\u001b[0m \"\"\"\n\u001b[1;32m 118\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0mready\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_is_ready\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileNotFoundError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0;31m# Can't distinguish 'known', 'cancelled', 'resetting' and 'awol' from the error,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/share/virtualenvs/FlowKit-tutorials-Ghana-XGVjzS5J/lib/python3.8/site-packages/flowclient/client.py\u001b[0m in \u001b[0;36mquery_is_ready\u001b[0;34m(connection, query_id)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34mf\"Polling server on {connection.url}/api/{connection.api_version}/poll/{query_id}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m )\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_url\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroute\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34mf\"poll/{query_id}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreply\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus_code\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m303\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/share/virtualenvs/FlowKit-tutorials-Ghana-XGVjzS5J/lib/python3.8/site-packages/flowclient/connection.py\u001b[0m in \u001b[0;36mget_url\u001b[0;34m(self, route, data)\u001b[0m\n\u001b[1;32m 137\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mValueError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[0mstatus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Unknown status\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 139\u001b[0;31m raise FlowclientConnectionError(\n\u001b[0m\u001b[1;32m 140\u001b[0m \u001b[0;34mf\"Something went wrong: {error}. API returned with status code: {response.status_code} and status '{status}'\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m )\n", + "\u001b[0;31mFlowclientConnectionError\u001b[0m: Something went wrong: . 
API returned with status code: 500 and status 'errored'" + ] + } + ], + "source": [ + "# Check statuses\n", + "for label, query_group in [\n", + " (\"Total subscribers\", total_subscribers_per_day_queries),\n", + " (\"Subscribers per admin1\", subscribers_per_admin1_per_day_queries),\n", + " (\"Subscribers_per_admin2\", subscribers_per_admin2_per_day_queries),\n", + " (\"OD matrix\", od_matrix_admin2_per_day_queries),\n", + "]:\n", + " print(\n", + " f\"{label}: {Counter([query.status for query in query_group.values()])}\"\n", + " )\n", + "\n", + "print(f\"Total events: {total_events_per_day_query.status}\")\n", + "print(f\"Events per admin2: {events_per_admin2_per_day_query.status}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pcodvalue
0GH2136902
\n", + "
" + ], + "text/plain": [ + " pcod value\n", + "0 GH 2136902" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get results\n", + "\n", + "# Get one result\n", + "first_result = total_subscribers_per_day_queries[all_dates[0]].get_result()\n", + "first_result" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pcodvaluedate
0GH21369022016-02-01
\n", + "
" + ], + "text/plain": [ + " pcod value date\n", + "0 GH 2136902 2016-02-01" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Add date column\n", + "first_result[\"date\"] = all_dates[0]\n", + "first_result" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pcodvaluedate
0GH21369022016-02-01
1GH21369172016-02-02
2GH21382072016-02-03
3GH21366912016-02-04
4GH21375452016-02-05
............
116GH21367742016-05-27
117GH21372982016-05-28
118GH21375512016-05-29
119GH21377482016-05-30
120GH21374702016-05-31
\n", + "

121 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " pcod value date\n", + "0 GH 2136902 2016-02-01\n", + "1 GH 2136917 2016-02-02\n", + "2 GH 2138207 2016-02-03\n", + "3 GH 2136691 2016-02-04\n", + "4 GH 2137545 2016-02-05\n", + ".. ... ... ...\n", + "116 GH 2136774 2016-05-27\n", + "117 GH 2137298 2016-05-28\n", + "118 GH 2137551 2016-05-29\n", + "119 GH 2137748 2016-05-30\n", + "120 GH 2137470 2016-05-31\n", + "\n", + "[121 rows x 3 columns]" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get all results with date columns\n", + "all_total_subscribers_results = [\n", + " query.get_result().assign(date=day)\n", + " for day, query in total_subscribers_per_day_queries.items()\n", + "]\n", + "\n", + "total_subscribers_per_day_results = pd.concat(all_total_subscribers_results, ignore_index=True)\n", + "\n", + "total_subscribers_per_day_results" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
valuedate
021369022016-02-01
121369172016-02-02
221382072016-02-03
321366912016-02-04
421375452016-02-05
.........
11621367742016-05-27
11721372982016-05-28
11821375512016-05-29
11921377482016-05-30
12021374702016-05-31
\n", + "

121 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " value date\n", + "0 2136902 2016-02-01\n", + "1 2136917 2016-02-02\n", + "2 2138207 2016-02-03\n", + "3 2136691 2016-02-04\n", + "4 2137545 2016-02-05\n", + ".. ... ...\n", + "116 2136774 2016-05-27\n", + "117 2137298 2016-05-28\n", + "118 2137551 2016-05-29\n", + "119 2137748 2016-05-30\n", + "120 2137470 2016-05-31\n", + "\n", + "[121 rows x 2 columns]" + ] + }, + "execution_count": 157, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Don't need \"pcod\" column, so let's drop it now\n", + "total_subscribers_per_day_results = total_subscribers_per_day_results.drop(columns=\"pcod\")\n", + "total_subscribers_per_day_results" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pcodvaluedate
0GHA.10_12276802016-02-01
1GHA.1_14701412016-02-01
2GHA.2_12046972016-02-01
3GHA.3_12222212016-02-01
4GHA.4_12467792016-02-01
............
1205GHA.5_14559082016-05-31
1206GHA.6_12996732016-05-31
1207GHA.7_11175762016-05-31
1208GHA.8_1723022016-05-31
1209GHA.9_12485452016-05-31
\n", + "

1210 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " pcod value date\n", + "0 GHA.10_1 227680 2016-02-01\n", + "1 GHA.1_1 470141 2016-02-01\n", + "2 GHA.2_1 204697 2016-02-01\n", + "3 GHA.3_1 222221 2016-02-01\n", + "4 GHA.4_1 246779 2016-02-01\n", + "... ... ... ...\n", + "1205 GHA.5_1 455908 2016-05-31\n", + "1206 GHA.6_1 299673 2016-05-31\n", + "1207 GHA.7_1 117576 2016-05-31\n", + "1208 GHA.8_1 72302 2016-05-31\n", + "1209 GHA.9_1 248545 2016-05-31\n", + "\n", + "[1210 rows x 3 columns]" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get admin1 subscriber counts\n", + "subscribers_per_admin1_per_day_results = pd.concat(\n", + " [\n", + " query.get_result().assign(date=day)\n", + " for day, query in subscribers_per_admin1_per_day_queries.items()\n", + " ],\n", + " ignore_index=True,\n", + ")\n", + "\n", + "subscribers_per_admin1_per_day_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get admin2 subscriber counts\n", + "subscribers_per_admin2_per_day_results = pd.concat(\n", + " [\n", + " query.get_result().assign(date=day)\n", + " for day, query in subscribers_per_admin2_per_day_queries.items()\n", + " ],\n", + " ignore_index=True,\n", + ")\n", + "\n", + "subscribers_per_admin2_per_day_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get admin2 OD matrix results\n", + "od_matrix_admin2_per_day_results = pd.concat(\n", + " [\n", + " query.get_result().assign(date=day)\n", + " for day, query in od_matrix_admin2_per_day_queries.items()\n", + " ],\n", + " ignore_index=True,\n", + ")\n", + "\n", + "od_matrix_admin2_per_day_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get total event counts\n", + "total_events_per_day_results = total_events_per_day_query.get_result()\n", + "\n", + "# Don't need 
to add a date column because it already has one, but...\n", + "total_events_per_day_results.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Date column is 'object' type (i.e. string) - need to convert to datetime\n", + "total_events_per_day_results[\"date\"] = pd.to_datetime(total_events_per_day_results[\"date\"])\n", + "\n", + "total_events_per_day_results" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pcoddatevalue
0GHA.10.10_12016-02-01136114
1GHA.10.1_12016-02-0128901
2GHA.10.11_12016-02-0118318
3GHA.10.12_12016-02-0129705
4GHA.10.13_12016-02-0147380
............
16500GHA.9.5_12016-05-3131992
16501GHA.9.6_12016-05-3113468
16502GHA.9.7_12016-05-3134588
16503GHA.9.8_12016-05-3153798
16504GHA.9.9_12016-05-3124071
\n", + "

16505 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " pcod date value\n", + "0 GHA.10.10_1 2016-02-01 136114\n", + "1 GHA.10.1_1 2016-02-01 28901\n", + "2 GHA.10.11_1 2016-02-01 18318\n", + "3 GHA.10.12_1 2016-02-01 29705\n", + "4 GHA.10.13_1 2016-02-01 47380\n", + "... ... ... ...\n", + "16500 GHA.9.5_1 2016-05-31 31992\n", + "16501 GHA.9.6_1 2016-05-31 13468\n", + "16502 GHA.9.7_1 2016-05-31 34588\n", + "16503 GHA.9.8_1 2016-05-31 53798\n", + "16504 GHA.9.9_1 2016-05-31 24071\n", + "\n", + "[16505 rows x 3 columns]" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get admin2 event counts\n", + "events_per_admin2_per_day_results = events_per_admin2_per_day_query.get_result()\n", + "events_per_admin2_per_day_results[\"date\"] = pd.to_datetime(events_per_admin2_per_day_results[\"date\"])\n", + "\n", + "events_per_admin2_per_day_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### QA checks\n", + "\n", + "- Additional queries (define, run, get results)\n", + "- Run checks\n", + " - Subscriber counts in urban vs rural\n", + " - sum(admin2 counts) / admin1 count ? (Maybe not - this might be an indicator)\n", + " - Stable over time (total, and per locality)\n", + " - Weekly variation (except there won't actually be any in the synthetic data)\n", + " - Events per subscriber - check sensible value and stable over time\n", + " - Missing data (temporal or spatial)\n", + "- Explain results (and how to tell whether they 'passed')\n", + "\n", + "**Note:** Don't need all checks, because some are covered by FlowKit tests (i.e. 
we need to QA check the data, not the implementation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Time-series plots\n", + "Should be stable over time and show weekly variation" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "total_subscribers_per_day_results.plot(x=\"date\", y=\"value\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Should be stable over time except for possible small changes due to new mobility restrictions\n", + "- Should show weekly variation (but not in the synthetic data)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "subscribers_per_admin1_per_day_results.pivot(index=\"date\", columns=\"pcod\", values=\"value\").plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "subscribers_per_admin2_per_day_results.pivot(index=\"date\", columns=\"pcod\", values=\"value\").plot(legend=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# May be clearer to plot sum\n", + "plt.figure()\n", + "subscribers_per_admin2_per_day_results.groupby(\"date\").sum().plot(ax=plt.gca(), y=\"value\", label=\"sum(subscribers per admin2)\")\n", + "\n", + "# Similar for OD matrix\n", + "od_matrix_admin2_per_day_results.groupby(\"date\").sum().plot(ax=plt.gca(), y=\"value\", label=\"sum(OD matrix)\")\n", + "\n", + "# Show total subscribers for comparison\n", + "total_subscribers_per_day_results.plot(ax=plt.gca(), x=\"date\", y=\"value\", label=\"total subscribers\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Should be stable over time except for changes due to new mobility restrictions (in this case, sharp changes are due to start of simulated \"disaster\" on 2016-03-01 and start of recovery period on 2016-03-10)\n", + "- Should show weekly variation (but not in the synthetic data)\n", + "- Explain why sum of admin2 is larger than total" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Spatial distribution of subscriber counts\n", + "Should be larger in urban districts, smaller in rural" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate median daily subscriber count per admin2\n", + "median_subscribers_per_admin2 = subscribers_per_admin2_per_day_results.groupby(\"pcod\").median()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
geometrypcodcentroid
0MULTIPOLYGON (((-1.41637 6.32333, -1.43242 6.3...GHA.1.1_1{'type': 'Point', 'coordinates': [-1.585607706...
1MULTIPOLYGON (((-1.54145 5.93492, -1.53550 5.9...GHA.1.2_1{'type': 'Point', 'coordinates': [-1.394701237...
2MULTIPOLYGON (((-1.41025 6.86558, -1.42795 6.8...GHA.1.3_1{'type': 'Point', 'coordinates': [-1.557937016...
3MULTIPOLYGON (((-2.29597 6.71882, -2.29462 6.7...GHA.1.4_1{'type': 'Point', 'coordinates': [-2.201639978...
4MULTIPOLYGON (((-1.81879 6.98329, -1.83513 6.9...GHA.1.5_1{'type': 'Point', 'coordinates': [-1.957634046...
............
132MULTIPOLYGON (((-2.36110 6.08630, -2.36740 6.0...GHA.10.9_1{'type': 'Point', 'coordinates': [-2.609585309...
133MULTIPOLYGON (((-1.82511 5.57554, -1.82662 5.5...GHA.10.11_1{'type': 'Point', 'coordinates': [-2.009582598...
134MULTIPOLYGON (((-1.70347 4.94569, -1.70347 4.9...GHA.10.10_1{'type': 'Point', 'coordinates': [-1.658353642...
135MULTIPOLYGON (((-2.59935 5.44091, -2.59728 5.4...GHA.10.12_1{'type': 'Point', 'coordinates': [-2.332295004...
136MULTIPOLYGON (((-1.97473 4.91827, -1.98106 4.9...GHA.10.13_1{'type': 'Point', 'coordinates': [-1.990833941...
\n", + "

137 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " geometry pcod \\\n", + "0 MULTIPOLYGON (((-1.41637 6.32333, -1.43242 6.3... GHA.1.1_1 \n", + "1 MULTIPOLYGON (((-1.54145 5.93492, -1.53550 5.9... GHA.1.2_1 \n", + "2 MULTIPOLYGON (((-1.41025 6.86558, -1.42795 6.8... GHA.1.3_1 \n", + "3 MULTIPOLYGON (((-2.29597 6.71882, -2.29462 6.7... GHA.1.4_1 \n", + "4 MULTIPOLYGON (((-1.81879 6.98329, -1.83513 6.9... GHA.1.5_1 \n", + ".. ... ... \n", + "132 MULTIPOLYGON (((-2.36110 6.08630, -2.36740 6.0... GHA.10.9_1 \n", + "133 MULTIPOLYGON (((-1.82511 5.57554, -1.82662 5.5... GHA.10.11_1 \n", + "134 MULTIPOLYGON (((-1.70347 4.94569, -1.70347 4.9... GHA.10.10_1 \n", + "135 MULTIPOLYGON (((-2.59935 5.44091, -2.59728 5.4... GHA.10.12_1 \n", + "136 MULTIPOLYGON (((-1.97473 4.91827, -1.98106 4.9... GHA.10.13_1 \n", + "\n", + " centroid \n", + "0 {'type': 'Point', 'coordinates': [-1.585607706... \n", + "1 {'type': 'Point', 'coordinates': [-1.394701237... \n", + "2 {'type': 'Point', 'coordinates': [-1.557937016... \n", + "3 {'type': 'Point', 'coordinates': [-2.201639978... \n", + "4 {'type': 'Point', 'coordinates': [-1.957634046... \n", + ".. ... \n", + "132 {'type': 'Point', 'coordinates': [-2.609585309... \n", + "133 {'type': 'Point', 'coordinates': [-2.009582598... \n", + "134 {'type': 'Point', 'coordinates': [-1.658353642... \n", + "135 {'type': 'Point', 'coordinates': [-2.332295004... \n", + "136 {'type': 'Point', 'coordinates': [-1.990833941... 
\n", + "\n", + "[137 rows x 3 columns]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get admin2 boundaries\n", + "admin2_geojson = fc.get_geography(connection=conn, aggregation_unit=\"admin2\")\n", + "admin2_gdf = gpd.GeoDataFrame.from_features(admin2_geojson)\n", + "admin2_gdf" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Join median subscriber counts to admin boundaries, and plot\n", + "median_subscribers_per_admin2_with_geo = admin2_gdf.merge(\n", + " median_subscribers_per_admin2, left_on=\"pcod\", right_index=True\n", + ")\n", + "\n", + "median_subscribers_per_admin2_with_geo.plot(column=\"value\", legend=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Should have larger subscriber counts in urban districts, and smaller in rural districts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Events per subscriber (for each admin2, then for whole country)" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Average events per subscriber per day (whole country)\n", + "average_events_per_subscriber = total_events_per_day_results.set_index(\"date\").value / total_subscribers_per_day_results.set_index(\"date\").value\n", + "\n", + "average_events_per_subscriber.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Values should be reasonable (e.g. not 500 events per subscriber per day)\n", + "- Values should be >= 1 (because every active subscriber has at least 1 event)\n", + "- Values should be fairly stable over time" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Per admin2\n", + "events_per_subscriber_per_admin2 = (\n", + " events_per_admin2_per_day_results.set_index([\"date\", \"pcod\"]) / subscribers_per_admin2_per_day_results.set_index([\"date\", \"pcod\"])\n", + ").reset_index()\n", + "events_per_subscriber_per_admin2.pivot(index=\"date\", columns=\"pcod\", values=\"value\").plot(legend=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Missing data\n", + "- Total events per day, over time" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "total_events_per_day_results.plot(x=\"date\", y=\"value\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- If there's an anomaly, show a map of events per admin2 on that day, to see if there's a spatial effect" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# E.g. if total event count was lower on 14 March, can plot event count per admin2 on that day\n", + "admin2_gdf.merge(\n", + " events_per_admin2_per_day_results[events_per_admin2_per_day_results.date == \"2016-03-14\"],\n", + " on=\"pcod\",\n", + ").plot(column=\"value\", legend=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If there is no data for parts of the country, this suggests the CDR data are incomplete." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indicators\n", + "\n", + "- Calculate indicators from aggregates\n", + " - Scaled subscriber count per admin2 (including baseline calculation)\n", + " - Average admin2 visited per subscriber for each admin1\n", + " - Scaled OD matrix subscriber count\n", + "- Produce visualisations (not sure exactly what to show for OD matrix - maybe pick a few pairs and show line plots; could also do a before vs after map of Accra -> other)\n", + "- Explain what we see" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Scaled subscriber count per admin2\n", + "\n", + "- Divide subscriber count by total, to mitigate effects of changes in calling behaviour (i.e. assume actual number of people is ~ constant, so changes in total are due to people calling more/less)\n", + "- Calculate baseline average\n", + "- % of baseline" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pcodvaluedatetotal_subscribersscaled_subscriber_count
0GHA.10.10_1649822016-02-0321382070.030391
1GHA.10.1_1138682016-02-0321382070.006486
2GHA.10.11_189342016-02-0321382070.004178
3GHA.10.12_1144852016-02-0321382070.006774
4GHA.10.13_1227582016-02-0321382070.010643
..................
5580GHA.9.5_1153982016-04-1121370140.007205
5581GHA.9.6_164032016-04-1121370140.002996
5582GHA.9.7_1164602016-04-1121370140.007702
5583GHA.9.8_1254062016-04-1121370140.011889
5584GHA.9.9_1118492016-04-1121370140.005545
\n", + "

5585 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " pcod value date total_subscribers \\\n", + "0 GHA.10.10_1 64982 2016-02-03 2138207 \n", + "1 GHA.10.1_1 13868 2016-02-03 2138207 \n", + "2 GHA.10.11_1 8934 2016-02-03 2138207 \n", + "3 GHA.10.12_1 14485 2016-02-03 2138207 \n", + "4 GHA.10.13_1 22758 2016-02-03 2138207 \n", + "... ... ... ... ... \n", + "5580 GHA.9.5_1 15398 2016-04-11 2137014 \n", + "5581 GHA.9.6_1 6403 2016-04-11 2137014 \n", + "5582 GHA.9.7_1 16460 2016-04-11 2137014 \n", + "5583 GHA.9.8_1 25406 2016-04-11 2137014 \n", + "5584 GHA.9.9_1 11849 2016-04-11 2137014 \n", + "\n", + " scaled_subscriber_count \n", + "0 0.030391 \n", + "1 0.006486 \n", + "2 0.004178 \n", + "3 0.006774 \n", + "4 0.010643 \n", + "... ... \n", + "5580 0.007205 \n", + "5581 0.002996 \n", + "5582 0.007702 \n", + "5583 0.011889 \n", + "5584 0.005545 \n", + "\n", + "[5585 rows x 5 columns]" + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Merge admin2 and total subscribers\n", + "merged_subscriber_counts = subscribers_per_admin2_per_day_results.merge(\n", + " total_subscribers_per_day_results.rename(columns={\"value\": \"total_subscribers\"}), on=\"date\"\n", + ")\n", + "\n", + "# Divide sub count by total subs to get scaled sub count\n", + "merged_subscriber_counts[\"scaled_subscriber_count\"] = (\n", + " merged_subscriber_counts.value / merged_subscriber_counts.total_subscribers\n", + ")\n", + "\n", + "merged_subscriber_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pcod\n", + "GHA.1.10_1 0.004650\n", + "GHA.1.11_1 0.009366\n", + "GHA.1.12_1 0.005379\n", + "GHA.1.13_1 0.011214\n", + "GHA.1.14_1 0.006864\n", + " ... 
\n", + "GHA.9.5_1 0.005901\n", + "GHA.9.6_1 0.002443\n", + "GHA.9.7_1 0.006250\n", + "GHA.9.8_1 0.009675\n", + "GHA.9.9_1 0.004555\n", + "Name: scaled_subscriber_count, Length: 137, dtype: float64" + ] + }, + "execution_count": 182, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Baseline median\n", + "baseline_start = \"2016-02-01\"\n", + "baseline_end = \"2016-02-29\"\n", + "\n", + "scaled_subscriber_count_baseline = merged_subscriber_counts[\n", + " (merged_subscriber_counts.date >= baseline_start)\n", + " & (merged_subscriber_counts.date < baseline_end)\n", + "].groupby(\"pcod\").median().scaled_subscriber_count\n", + "scaled_subscriber_count_baseline" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pcodvaluedatetotal_subscribersscaled_subscriber_countscaled_subscriber_count_baseline
0GHA.10.10_1649822016-02-0321382070.0303910.030399
137GHA.10.10_1651102016-02-0421366910.0304720.030399
274GHA.10.10_1649112016-02-0721371920.0303720.030399
411GHA.10.10_1649482016-02-1021368680.0303940.030399
548GHA.10.10_1651592016-02-1121370270.0304900.030399
.....................
5036GHA.9.9_1118122016-04-0721373040.0055270.004555
5173GHA.9.9_1117962016-04-0821377080.0055180.004555
5310GHA.9.9_1117702016-04-0921370870.0055070.004555
5447GHA.9.9_1118642016-04-1021359130.0055550.004555
5584GHA.9.9_1118492016-04-1121370140.0055450.004555
\n", + "

5585 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " pcod value date total_subscribers \\\n", + "0 GHA.10.10_1 64982 2016-02-03 2138207 \n", + "137 GHA.10.10_1 65110 2016-02-04 2136691 \n", + "274 GHA.10.10_1 64911 2016-02-07 2137192 \n", + "411 GHA.10.10_1 64948 2016-02-10 2136868 \n", + "548 GHA.10.10_1 65159 2016-02-11 2137027 \n", + "... ... ... ... ... \n", + "5036 GHA.9.9_1 11812 2016-04-07 2137304 \n", + "5173 GHA.9.9_1 11796 2016-04-08 2137708 \n", + "5310 GHA.9.9_1 11770 2016-04-09 2137087 \n", + "5447 GHA.9.9_1 11864 2016-04-10 2135913 \n", + "5584 GHA.9.9_1 11849 2016-04-11 2137014 \n", + "\n", + " scaled_subscriber_count scaled_subscriber_count_baseline \n", + "0 0.030391 0.030399 \n", + "137 0.030472 0.030399 \n", + "274 0.030372 0.030399 \n", + "411 0.030394 0.030399 \n", + "548 0.030490 0.030399 \n", + "... ... ... \n", + "5036 0.005527 0.004555 \n", + "5173 0.005518 0.004555 \n", + "5310 0.005507 0.004555 \n", + "5447 0.005555 0.004555 \n", + "5584 0.005545 0.004555 \n", + "\n", + "[5585 rows x 6 columns]" + ] + }, + "execution_count": 183, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_subscriber_counts = merged_subscriber_counts.merge(\n", + " scaled_subscriber_count_baseline,\n", + " left_on=\"pcod\",\n", + " right_index=True,\n", + " suffixes=(\"\", \"_baseline\")\n", + ")\n", + "\n", + "merged_subscriber_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pcodvaluedatetotal_subscribersscaled_subscriber_countscaled_subscriber_count_baselinepercent_change
0GHA.10.10_1649822016-02-0321382070.0303910.030399-0.027144
137GHA.10.10_1651102016-02-0421366910.0304720.0303990.240851
274GHA.10.10_1649112016-02-0721371920.0303720.030399-0.088948
411GHA.10.10_1649482016-02-1021368680.0303940.030399-0.016840
548GHA.10.10_1651592016-02-1121370270.0304900.0303990.300517
........................
5036GHA.9.9_1118122016-04-0721373040.0055270.00455521.333545
5173GHA.9.9_1117962016-04-0821377080.0055180.00455521.146293
5310GHA.9.9_1117702016-04-0921370870.0055070.00455520.914395
5447GHA.9.9_1118642016-04-1021359130.0055550.00455521.947058
5584GHA.9.9_1118492016-04-1121370140.0055450.00455521.730128
\n", + "

5585 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " pcod value date total_subscribers \\\n", + "0 GHA.10.10_1 64982 2016-02-03 2138207 \n", + "137 GHA.10.10_1 65110 2016-02-04 2136691 \n", + "274 GHA.10.10_1 64911 2016-02-07 2137192 \n", + "411 GHA.10.10_1 64948 2016-02-10 2136868 \n", + "548 GHA.10.10_1 65159 2016-02-11 2137027 \n", + "... ... ... ... ... \n", + "5036 GHA.9.9_1 11812 2016-04-07 2137304 \n", + "5173 GHA.9.9_1 11796 2016-04-08 2137708 \n", + "5310 GHA.9.9_1 11770 2016-04-09 2137087 \n", + "5447 GHA.9.9_1 11864 2016-04-10 2135913 \n", + "5584 GHA.9.9_1 11849 2016-04-11 2137014 \n", + "\n", + " scaled_subscriber_count scaled_subscriber_count_baseline \\\n", + "0 0.030391 0.030399 \n", + "137 0.030472 0.030399 \n", + "274 0.030372 0.030399 \n", + "411 0.030394 0.030399 \n", + "548 0.030490 0.030399 \n", + "... ... ... \n", + "5036 0.005527 0.004555 \n", + "5173 0.005518 0.004555 \n", + "5310 0.005507 0.004555 \n", + "5447 0.005555 0.004555 \n", + "5584 0.005545 0.004555 \n", + "\n", + " percent_change \n", + "0 -0.027144 \n", + "137 0.240851 \n", + "274 -0.088948 \n", + "411 -0.016840 \n", + "548 0.300517 \n", + "... ... \n", + "5036 21.333545 \n", + "5173 21.146293 \n", + "5310 20.914395 \n", + "5447 21.947058 \n", + "5584 21.730128 \n", + "\n", + "[5585 rows x 7 columns]" + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_subscriber_counts[\"percent_change\"] = (\n", + " merged_subscriber_counts.scaled_subscriber_count / merged_subscriber_counts.scaled_subscriber_count_baseline - 1\n", + ") * 100\n", + "merged_subscriber_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(-100.0, 31.561113800097285)" + ] + }, + "execution_count": 190, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Choose a few districts to show\n", + "display_districts = {\n", + " \"Accra\": \"GHA.5.1_1\",\n", + " \"Kumasi\": \"GHA.1.16_1\",\n", + " \"Tamale\": \"GHA.6.13_1\",\n", + "}\n", + "\n", + "# Plot % change over time for the chosen districts\n", + "plt.figure()\n", + "for name, pcod in display_districts.items():\n", + " merged_subscriber_counts[merged_subscriber_counts.pcod == pcod].plot(\n", + " ax=plt.gca(), x=\"date\", y=\"percent_change\", marker=\".\", ls=\"\", label=name\n", + " )\n", + "plt.axhline(0, ls=\":\", c='k')\n", + "plt.ylim(bottom=-100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Average admin2 per subscriber in each admin1\n", + "\n", + "- Add admin1 pcode column to admin2 counts\n", + "- Sum admin2 counts per admin1\n", + "- Join to admin1 counts\n", + "- Divide\n", + "- Plot over time for all admin1 regions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Scaled OD matrix counts\n", + "- Same process as for scaled admin2 counts\n", + "- Show a map of before vs after for either Accra or Kumasi (not sure which will look better)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary\n", + "\n", + "- What we did\n", + "- What it showed\n", + "- Maybe some advice on how this could be modified to do other things" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From d877dd70b905ba0978edbb6b898129b87d0c49c0 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sat, 
1 Aug 2020 20:20:02 +0100 Subject: [PATCH 2/3] Add descriptions --- mobility-indicators.ipynb | 3189 +++++++++++++++---------------------- 1 file changed, 1284 insertions(+), 1905 deletions(-) diff --git a/mobility-indicators.ipynb b/mobility-indicators.ipynb index 3eee303..02dd7bc 100644 --- a/mobility-indicators.ipynb +++ b/mobility-indicators.ipynb @@ -7,9 +7,7 @@ "# Mobility indicators\n", "## Assessing the impact of mobility restrictions during a virus outbreak\n", "\n", - "In this worked example...\n", - "\n", - "(Include link to COVID website)" + "In this worked example we will assume the role of an analyst in Ghana monitoring the effects of government restrictions on mobility patterns during a virus outbreak. This is based on material produced during the COVID-19 pandemic, which can be found on Flowminder's [COVID-19 website](https://covid19.flowminder.org/) and [GitHub repository](https://github.com/Flowminder/COVID-19)." ] }, { @@ -18,58 +16,84 @@ "source": [ "### Introduction\n", "\n", - "- What we aim to do\n", - "- Summary of where we're heading, what prior knowledge is required, and what the end result will be" + "This example is written assuming that the reader has worked through the following tutorials, to gain familiarity with the use of FlowKit to produce CDR aggregates:\n", + "\n", + "- [Getting started with FlowClient](01-getting-started-with-flowclient.ipynb)\n", + "- [Running a query](02-running-a-query.ipynb)\n", + "- [Geography](03-geography.ipynb)\n", + "\n", + "In this example, we will first use FlowKit to produce some aggregates from CDR data. We will then perform some checks on these aggregates, to identify any potential issues with the underlying data. Finally, we will use the aggregates to produce some mobility indicators, which will give us insights into changes in population density and movements around the country over time." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Run aggregates\n", + "### Connect to FlowKit server\n", "\n", - "- Define queries\n", - "- Set aggregates running\n", - "- Get results" + "We'll start by creating a connection to the Ghana FlowCloud FlowKit server, following the instructions in the [\"Getting started with FlowClient\" tutorial](01-getting-started-with-flowclient.ipynb). First we import the FlowClient library, along with pandas, geopandas and matplotlib which we'll use later for further analysis." ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import flowclient as fc\n", - "import pandas as pd\n", "import geopandas as gpd\n", "import matplotlib.pyplot as plt\n", - "from collections import Counter" + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can get a token from [FlowAuth](https://auth.flowcloud-ghana.flowminder.org/) and use it to create a FlowKit connection:" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "token = " + "token =" ] }, { "cell_type": "code", - "execution_count": 83, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "conn = fc.connect(\n", - " url=\"https://api.flowcloud-ghana.flowminder.org\",\n", - " token=token,\n", - ")" + "conn = fc.connect(url=\"https://api.flowcloud-ghana.flowminder.org\", token=token)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run aggregates\n", + "\n", + "The first step is to run the aggregate queries that we will need for this analysis. The aggregates we will need are:\n", + "\n", + "1. Total subscriber count per day, for the whole country (admin0). This is the total number of unique subscribers who used their phone anywhere in the country, each day.\n", + "2. Subscriber count per region per day (admin1). 
This is similar to \"subscriber count per day\", but with a separate unique subscriber count for each region.\n", + "3. Subscriber count per district per day (admin2). This is similar to \"subscriber count per region per day\", but will count subscribers per district (admin2) instead of per region (admin1).\n", + "4. District-level OD matrix per day. For each pair of districts A and B each day, the OD matrix contains the count of subscribers who were active in district A and then active in district B later the same day.\n", + "5. Count of CDR events per day, for the whole country (admin0). This will not be used in the final analysis in this example, but will be useful when we check the aggregate results to ensure there are no issues with the underlying CDR data.\n", + "6. Count of CDR events per district per day (admin2). This is similar to \"count of CDR events per day\", but with a separate event count for each district. Again, this will be useful when we perform some quality checks on the query results.\n", + "\n", + "We need to specify the date range for which we want to run the queries. The mobility restrictions started on 1 March 2016, so we'll get results from the beginning of February (1 month before restrictions began) until the end of May (3 months after restrictions began). In a real-world scenario we could update this analysis every few days using the latest available CDR data, to get a near-real-time picture of mobility in Ghana. We'll make a list of all dates in our date range, using the pandas `date_range` function.\n", + "\n", + "**Note:** We will use the FlowKit convention that date intervals include the lower bound (start date) and exclude the upper bound (end date)." 
] }, { "cell_type": "code", - "execution_count": 108, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -79,26 +103,50 @@ "all_dates = pd.date_range(start_date, end_date, closed=\"left\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now define and run our FlowKit queries, following instructions in the [\"Running a query\" tutorial](02-running-a-query.ipynb).\n", + "\n", + "#### 1. Total subscriber count per day\n", + "\n", + "For \"total subscriber count per day\", we can use `unique_subscriber_counts` queries and set `aggregation_unit=\"admin0\"` - this will count the total number of unique active subscribers in the country in a specified time period. We will need a separate query for each day in our date range. For example, to count the total number of active subscribers on 1 February 2016 we can use the following query:" + ] + }, { "cell_type": "code", - "execution_count": 85, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Total subscribers per day\n", - "\n", - "# E.g. query for 1 day\n", - "total_subscribers_20160101_query = fc.unique_subscriber_counts(\n", + "total_subscribers_20160201_query = fc.unique_subscriber_counts(\n", " connection=conn,\n", - " start_date=\"2016-01-01\",\n", - " end_date=\"2016-01-02\",\n", + " start_date=\"2016-02-01\",\n", + " end_date=\"2016-02-02\",\n", " aggregation_unit=\"admin0\",\n", - ")\n", - "\n", - "# Queries for all dates\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a dictionary of \"total subscribers per day\" queries for all the days in our date range. To specify a 1-day time interval for each query, we need the `end_date` to be the day _after_ the `start_date` - we can do this by looping over the dates in `all_dates`, setting each date as the `start_date` of a query, and adding `pd.Timedelta(\"1 day\")` to that date to get the `end_date`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an empty dictionary\n", "total_subscribers_per_day_queries = {}\n", "\n", + "# Loop over the days in our date range\n", "for day in all_dates:\n", + " # Create a query object for this day, and add it to the dictionary\n", " total_subscribers_per_day_queries[day] = fc.unique_subscriber_counts(\n", " connection=conn,\n", " start_date=str(day),\n", @@ -107,25 +155,43 @@ " )" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have defined our \"total subscribers per day\" queries, we can ask the FlowKit server to run them by calling the `run()` method of each query object." + ] + }, { "cell_type": "code", - "execution_count": 86, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Set them running\n", "for query in total_subscribers_per_day_queries.values():\n", " query.run()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These queries may take a while to run, so we can continue to define and run the other queries we need while waiting for the results to become available.\n", + "\n", + "#### 2. Subscriber count per region per day\n", + "\n", + "For \"subscriber count per region per day\", we can use `unique_subscriber_counts` queries again, but this time set `aggregation_unit=admin1` (the admin1 divisions are the regions of Ghana). These queries will count the number of active subscribers in each region during the specified time interval.\n", + "\n", + "Let's create a dictionary of \"subscribers per region per day\" query objects, like we did for \"total subscribers per day\". 
We can use a dictionary comprehension to express this in a more compact form:" + ] + }, { "cell_type": "code", - "execution_count": 87, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Subscribers per admin1 per day\n", - "subscribers_per_admin1_per_day_queries = {\n", + "subscribers_per_region_per_day_queries = {\n", " day: fc.unique_subscriber_counts(\n", " connection=conn,\n", " start_date=str(day),\n", @@ -133,20 +199,42 @@ " aggregation_unit=\"admin1\",\n", " )\n", " for day in all_dates\n", - "}\n", - "\n", - "for query in subscribers_per_admin1_per_day_queries.values():\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And again, we can now use the `run()` method to ask FlowKit to run these queries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for query in subscribers_per_region_per_day_queries.values():\n", + " query.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Subscriber count per district per day\n", + "\n", + "Similar to \"subscriber count per region per day\", we can calculate the subscriber count per district per day using `unique_subscriber_counts` queries with `aggregation_unit=\"admin2\"` (district level).
We can define and run these queries as we did for \"subscriber count per region per day\":" + ] + }, { "cell_type": "code", - "execution_count": 146, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Subscribers per admin2 per day\n", - "subscribers_per_admin2_per_day_queries = {\n", + "subscribers_per_district_per_day_queries = {\n", " day: fc.unique_subscriber_counts(\n", " connection=conn,\n", " start_date=str(day),\n", @@ -156,18 +244,28 @@ " for day in all_dates\n", "}\n", "\n", - "for query in subscribers_per_admin2_per_day_queries.values():\n", + "for query in subscribers_per_district_per_day_queries.values():\n", " query.run()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. District-level OD matrix per day\n", + "\n", + "For the OD (origin-destination) matrix, we can use `trips_od_matrix` queries. For each pair of locations A and B, a `trips_od_matrix` query will count the number of unique subscribers who were active in A and then later active in B during the specified time period.\n", + "\n", + "The parameters of `trips_od_matrix` are the same as the parameters of `unique_subscriber_counts`: a `start_date` and `end_date` specifying a time interval, and an `aggregation_unit`. We'll set `aggregation_unit=\"admin2\"`, to get counts of subscribers moving between districts. We will create a dictionary of query objects and set them all running, as we did with the previous queries." 
+ ] + }, { "cell_type": "code", - "execution_count": 89, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# OD matrix (admin2)\n", - "od_matrix_admin2_per_day_queries = {\n", + "od_matrix_district_per_day_queries = {\n", " day: fc.trips_od_matrix(\n", " connection=conn,\n", " start_date=str(day),\n", @@ -177,18 +275,32 @@ " for day in all_dates\n", "}\n", "\n", - "for query in od_matrix_admin2_per_day_queries.values():\n", + "for query in od_matrix_district_per_day_queries.values():\n", " query.run()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5. Total event count per day\n", + "\n", + "The total number of CDR events (calls, SMS, data sessions, etc.) per day will be useful information for checking whether there are any issues with the underlying CDR data within FlowKit. For example, this would help to identify any periods with missing data. We can use a `location_event_counts` query to count the number of events per day. This query has the following parameters:\n", + "\n", + "- `start_date`: the first date in the time period\n", + "- `end_date`: the day _after_ the end of the time period\n", + "- `count_interval`: the length of time intervals within which events should be counted (e.g. `\"minute\"`, `\"hour\"`, `\"day\"`)\n", + "- `aggregation_unit`: the level of spatial aggregation unit for which events will be counted\n", + "\n", + "By setting `count_interval=\"day\"`, we can get the event count for each day using a single query (unlike the `unique_subscriber_counts` and `trips_od_matrix` queries we defined earlier, where a separate query was required for each day). We will set `start_date` and `end_date` to be the start and end dates of the full date range, as we defined them at the beginning of this notebook, and we will set `aggregation_unit=\"admin0\"` to get the total event count for the entire country." 
+ ] + }, { "cell_type": "code", - "execution_count": 111, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Total events per day\n", - "\n", "total_events_per_day_query = fc.location_event_counts(\n", " connection=conn,\n", " start_date=start_date,\n", @@ -200,14 +312,22 @@ "total_events_per_day_query.run()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6. Event count per district per day\n", + "\n", + "We can also use a `location_event_counts` query to get a separate event count per day for each district, which will be useful to identify whether there are any data issues that only affect certain geographic areas. This query is identical to the \"total events per day\" query above, except that we've changed the aggregation unit to `\"admin2\"` to get an event count for each district." + ] + }, { "cell_type": "code", - "execution_count": 112, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Event counts (admin2 per day)\n", - "events_per_admin2_per_day_query = fc.location_event_counts(\n", + "events_per_district_per_day_query = fc.location_event_counts(\n", " connection=conn,\n", " start_date=start_date,\n", " end_date=end_date,\n", @@ -215,581 +335,32 @@ " aggregation_unit=\"admin2\",\n", ")\n", "\n", - "events_per_admin2_per_day_query.run()" - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total subscribers: Counter({'completed': 121})\n", - "Subscribers per admin1: Counter({'completed': 121})\n" - ] - }, - { - "ename": "FlowclientConnectionError", - "evalue": "Something went wrong: . 
API returned with status code: 500 and status 'errored'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFlowclientConnectionError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m ]:\n\u001b[1;32m 8\u001b[0m print(\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;34mf\"{label}: {Counter([query.status for query in query_group.values()])}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m ]:\n\u001b[1;32m 8\u001b[0m print(\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;34mf\"{label}: {Counter([query.status for query in query_group.values()])}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/share/virtualenvs/FlowKit-tutorials-Ghana-XGVjzS5J/lib/python3.8/site-packages/flowclient/api_query.py\u001b[0m in \u001b[0;36mstatus\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"_query_id\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"not_running\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 85\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mget_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mquery_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_query_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 86\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m def get_result(\n", - "\u001b[0;32m~/.local/share/virtualenvs/FlowKit-tutorials-Ghana-XGVjzS5J/lib/python3.8/site-packages/flowclient/client.py\u001b[0m in \u001b[0;36mget_status\u001b[0;34m(connection, query_id)\u001b[0m\n\u001b[1;32m 117\u001b[0m \"\"\"\n\u001b[1;32m 118\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0mready\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_is_ready\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquery_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileNotFoundError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0;31m# Can't distinguish 'known', 'cancelled', 'resetting' and 'awol' from the error,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/share/virtualenvs/FlowKit-tutorials-Ghana-XGVjzS5J/lib/python3.8/site-packages/flowclient/client.py\u001b[0m in \u001b[0;36mquery_is_ready\u001b[0;34m(connection, query_id)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34mf\"Polling server on {connection.url}/api/{connection.api_version}/poll/{query_id}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m )\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_url\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroute\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34mf\"poll/{query_id}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreply\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus_code\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m303\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/share/virtualenvs/FlowKit-tutorials-Ghana-XGVjzS5J/lib/python3.8/site-packages/flowclient/connection.py\u001b[0m in \u001b[0;36mget_url\u001b[0;34m(self, route, data)\u001b[0m\n\u001b[1;32m 137\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mValueError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[0mstatus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Unknown status\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 139\u001b[0;31m raise FlowclientConnectionError(\n\u001b[0m\u001b[1;32m 140\u001b[0m \u001b[0;34mf\"Something went wrong: {error}. API returned with status code: {response.status_code} and status '{status}'\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m )\n", - "\u001b[0;31mFlowclientConnectionError\u001b[0m: Something went wrong: . 
API returned with status code: 500 and status 'errored'" - ] - } - ], - "source": [ - "# Check statuses\n", - "for label, query_group in [\n", - " (\"Total subscribers\", total_subscribers_per_day_queries),\n", - " (\"Subscribers per admin1\", subscribers_per_admin1_per_day_queries),\n", - " (\"Subscribers_per_admin2\", subscribers_per_admin2_per_day_queries),\n", - " (\"OD matrix\", od_matrix_admin2_per_day_queries),\n", - "]:\n", - " print(\n", - " f\"{label}: {Counter([query.status for query in query_group.values()])}\"\n", - " )\n", - "\n", - "print(f\"Total events: {total_events_per_day_query.status}\")\n", - "print(f\"Events per admin2: {events_per_admin2_per_day_query.status}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pcodvalue
0GH2136902
\n", - "
" - ], - "text/plain": [ - " pcod value\n", - "0 GH 2136902" - ] - }, - "execution_count": 93, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get results\n", - "\n", - "# Get one result\n", - "first_result = total_subscribers_per_day_queries[all_dates[0]].get_result()\n", - "first_result" + "events_per_district_per_day_query.run()" ] }, { - "cell_type": "code", - "execution_count": 94, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pcodvaluedate
0GH21369022016-02-01
\n", - "
" - ], - "text/plain": [ - " pcod value date\n", - "0 GH 2136902 2016-02-01" - ] - }, - "execution_count": 94, - "metadata": {}, - "output_type": "execute_result" - } - ], + "cell_type": "markdown", + "metadata": {}, "source": [ - "# Add date column\n", - "first_result[\"date\"] = all_dates[0]\n", - "first_result" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pcodvaluedate
0GH21369022016-02-01
1GH21369172016-02-02
2GH21382072016-02-03
3GH21366912016-02-04
4GH21375452016-02-05
............
116GH21367742016-05-27
117GH21372982016-05-28
118GH21375512016-05-29
119GH21377482016-05-30
120GH21374702016-05-31
\n", - "

121 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " pcod value date\n", - "0 GH 2136902 2016-02-01\n", - "1 GH 2136917 2016-02-02\n", - "2 GH 2138207 2016-02-03\n", - "3 GH 2136691 2016-02-04\n", - "4 GH 2137545 2016-02-05\n", - ".. ... ... ...\n", - "116 GH 2136774 2016-05-27\n", - "117 GH 2137298 2016-05-28\n", - "118 GH 2137551 2016-05-29\n", - "119 GH 2137748 2016-05-30\n", - "120 GH 2137470 2016-05-31\n", - "\n", - "[121 rows x 3 columns]" - ] - }, - "execution_count": 95, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get all results with date columns\n", - "all_total_subscribers_results = [\n", - " query.get_result().assign(date=day)\n", - " for day, query in total_subscribers_per_day_queries.items()\n", - "]\n", + "#### Check status of queries\n", "\n", - "total_subscribers_per_day_results = pd.concat(all_total_subscribers_results, ignore_index=True)\n", - "\n", - "total_subscribers_per_day_results" + "We can check the status of a query by looking at the value of its `status` property. Once the result of a query is ready, its status will be `'completed'`. For example, we can check the status of `total_events_per_day_query` like this:" ] }, { "cell_type": "code", - "execution_count": 157, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
valuedate
021369022016-02-01
121369172016-02-02
221382072016-02-03
321366912016-02-04
421375452016-02-05
.........
11621367742016-05-27
11721372982016-05-28
11821375512016-05-29
11921377482016-05-30
12021374702016-05-31
\n", - "

121 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " value date\n", - "0 2136902 2016-02-01\n", - "1 2136917 2016-02-02\n", - "2 2138207 2016-02-03\n", - "3 2136691 2016-02-04\n", - "4 2137545 2016-02-05\n", - ".. ... ...\n", - "116 2136774 2016-05-27\n", - "117 2137298 2016-05-28\n", - "118 2137551 2016-05-29\n", - "119 2137748 2016-05-30\n", - "120 2137470 2016-05-31\n", - "\n", - "[121 rows x 2 columns]" - ] - }, - "execution_count": 157, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Don't need \"pcod\" column, so let's drop it now\n", - "total_subscribers_per_day_results = total_subscribers_per_day_results.drop(columns=\"pcod\")\n", - "total_subscribers_per_day_results" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pcodvaluedate
0GHA.10_12276802016-02-01
1GHA.1_14701412016-02-01
2GHA.2_12046972016-02-01
3GHA.3_12222212016-02-01
4GHA.4_12467792016-02-01
............
1205GHA.5_14559082016-05-31
1206GHA.6_12996732016-05-31
1207GHA.7_11175762016-05-31
1208GHA.8_1723022016-05-31
1209GHA.9_12485452016-05-31
\n", - "

1210 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " pcod value date\n", - "0 GHA.10_1 227680 2016-02-01\n", - "1 GHA.1_1 470141 2016-02-01\n", - "2 GHA.2_1 204697 2016-02-01\n", - "3 GHA.3_1 222221 2016-02-01\n", - "4 GHA.4_1 246779 2016-02-01\n", - "... ... ... ...\n", - "1205 GHA.5_1 455908 2016-05-31\n", - "1206 GHA.6_1 299673 2016-05-31\n", - "1207 GHA.7_1 117576 2016-05-31\n", - "1208 GHA.8_1 72302 2016-05-31\n", - "1209 GHA.9_1 248545 2016-05-31\n", - "\n", - "[1210 rows x 3 columns]" - ] - }, - "execution_count": 96, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get admin1 subscriber counts\n", - "subscribers_per_admin1_per_day_results = pd.concat(\n", - " [\n", - " query.get_result().assign(date=day)\n", - " for day, query in subscribers_per_admin1_per_day_queries.items()\n", - " ],\n", - " ignore_index=True,\n", - ")\n", - "\n", - "subscribers_per_admin1_per_day_results" + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Total event counts:\", total_events_per_day_query.status)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# Get admin2 subscriber counts\n", - "subscribers_per_admin2_per_day_results = pd.concat(\n", - " [\n", - " query.get_result().assign(date=day)\n", - " for day, query in subscribers_per_admin2_per_day_queries.items()\n", - " ],\n", - " ignore_index=True,\n", - ")\n", - "\n", - "subscribers_per_admin2_per_day_results" + "For the subscriber counts and OD matrix queries we have a separate query for each day, and each query could have a different status. 
A convenient way to check the status of all the queries in the `total_subscribers_per_day_queries` dictionary is to use a [Counter](https://docs.python.org/3/library/collections.html#collections.Counter):" ] }, { @@ -798,29 +369,19 @@ "metadata": {}, "outputs": [], "source": [ - "# Get admin2 OD matrix results\n", - "od_matrix_admin2_per_day_results = pd.concat(\n", - " [\n", - " query.get_result().assign(date=day)\n", - " for day, query in od_matrix_admin2_per_day_queries.items()\n", - " ],\n", - " ignore_index=True,\n", - ")\n", + "from collections import Counter\n", "\n", - "od_matrix_admin2_per_day_results" + "print(\n", + " \"Total subscriber counts:\",\n", + " Counter([query.status for query in total_subscribers_per_day_queries.values()]),\n", + ")" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# Get total event counts\n", - "total_events_per_day_results = total_events_per_day_query.get_result()\n", - "\n", - "# Don't need to add a date column because it already has one, but...\n", - "total_events_per_day_results.info()" + "Let's check the status of all our queries:" ] }, { @@ -829,1407 +390,1221 @@ "metadata": {}, "outputs": [], "source": [ - "# Date column is 'object' type (i.e. 
string) - need to convert to datetime\n", - "total_events_per_day_results[\"date\"] = pd.to_datetime(total_events_per_day_results[\"date\"])\n", + "for label, query_group in [\n", + " (\"Total subscriber counts\", total_subscribers_per_day_queries),\n", + " (\"Subscriber counts per region\", subscribers_per_region_per_day_queries),\n", + " (\"Subscriber counts per district\", subscribers_per_district_per_day_queries),\n", + " (\"OD matrix (district level)\", od_matrix_district_per_day_queries),\n", + "]:\n", + " print(label, Counter([query.status for query in query_group.values()]))\n", "\n", - "total_events_per_day_results" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pcoddatevalue
0GHA.10.10_12016-02-01136114
1GHA.10.1_12016-02-0128901
2GHA.10.11_12016-02-0118318
3GHA.10.12_12016-02-0129705
4GHA.10.13_12016-02-0147380
............
16500GHA.9.5_12016-05-3131992
16501GHA.9.6_12016-05-3113468
16502GHA.9.7_12016-05-3134588
16503GHA.9.8_12016-05-3153798
16504GHA.9.9_12016-05-3124071
\n", - "

16505 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " pcod date value\n", - "0 GHA.10.10_1 2016-02-01 136114\n", - "1 GHA.10.1_1 2016-02-01 28901\n", - "2 GHA.10.11_1 2016-02-01 18318\n", - "3 GHA.10.12_1 2016-02-01 29705\n", - "4 GHA.10.13_1 2016-02-01 47380\n", - "... ... ... ...\n", - "16500 GHA.9.5_1 2016-05-31 31992\n", - "16501 GHA.9.6_1 2016-05-31 13468\n", - "16502 GHA.9.7_1 2016-05-31 34588\n", - "16503 GHA.9.8_1 2016-05-31 53798\n", - "16504 GHA.9.9_1 2016-05-31 24071\n", - "\n", - "[16505 rows x 3 columns]" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get admin2 event counts\n", - "events_per_admin2_per_day_results = events_per_admin2_per_day_query.get_result()\n", - "events_per_admin2_per_day_results[\"date\"] = pd.to_datetime(events_per_admin2_per_day_results[\"date\"])\n", - "\n", - "events_per_admin2_per_day_results" + "print(f\"Total event counts: {total_events_per_day_query.status}\")\n", + "print(f\"Event counts per district: {events_per_district_per_day_query.status}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### QA checks\n", + "### Get query results\n", "\n", - "- Additional queries (define, run, get results)\n", - "- Run checks\n", - " - Subscriber counts in urban vs rural\n", - " - sum(admin2 counts) / admin1 count ? (Maybe not - this might be an indicator)\n", - " - Stable over time (total, and per locality)\n", - " - Weekly variation (except there won't actually be any in the synthetic data)\n", - " - Events per subscriber - check sensible value and stable over time\n", - " - Missing data (temporal or spatial)\n", - "- Explain results (and how to tell whether they 'passed')\n", + "#### 1. Total subscriber count per day\n", "\n", - "**Note:** Don't need all checks, because some are covered by FlowKit tests (i.e. we need to QA check the data, not the implementation)" + "Once our queries have finished running, we can get the results using the `get_result` method. 
For example, to get the result of the \"total subscribers per day\" query for the first day in our date range:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "#### Time-series plots\n", - "Should be stable over time and show weekly variation" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 98, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "total_subscribers_per_day_results.plot(x=\"date\", y=\"value\")" + "first_result = total_subscribers_per_day_queries[all_dates[0]].get_result()\n", + "first_result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Should be stable over time except for possible small changes due to new mobility restrictions\n", - "- Should show weekly variation (but not in the synthetic data)" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "subscribers_per_admin1_per_day_results.pivot(index=\"date\", columns=\"pcod\", values=\"value\").plot()" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 142, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "subscribers_per_admin2_per_day_results.pivot(index=\"date\", columns=\"pcod\", values=\"value\").plot(legend=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 152, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 152, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# May be clearer to plot sum\n", - "plt.figure()\n", - "subscribers_per_admin2_per_day_results.groupby(\"date\").sum().plot(ax=plt.gca(), y=\"value\", label=\"sum(subscribers per admin2)\")\n", + "This result has two columns:\n", "\n", - "# Similar for OD matrix\n", - "od_matrix_admin2_per_day_results.groupby(\"date\").sum().plot(ax=plt.gca(), y=\"value\", label=\"sum(OD matrix)\")\n", + "- `pcod` is the P-code. Since this result is at admin0 level, there is only one P-code (\"GH\" is the P-code for the whole of Ghana).\n", + "- `value` is the unique subscriber count.\n", "\n", - "# Show total subscribers for comparison\n", - "total_subscribers_per_day_results.plot(ax=plt.gca(), x=\"date\", y=\"value\", label=\"total subscribers\")" + "This is the result for a single day. It will be convenient for us to combine the \"total subscribers per day\" results for all days into a single DataFrame - so that we know which result corresponds to which date, we can add a 'date' column:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "- Should be stable over time except for changes due to new mobility restrictions (in this case, sharp changes are due to start of simulated \"disaster\" on 2016-03-01 and start of recovery period on 2016-03-10)\n", - "- Should show weekly variation (but not in the synthetic data)\n", - "- Explain why sum of admin2 is larger than total" + "# Add date column\n", + "first_result[\"date\"] = all_dates[0]\n", + "first_result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Spatial distribution of subscriber counts\n", - "Should be larger in urban districts, smaller in rural" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Calculate median daily subscriber count per 
admin2\n", - "median_subscribers_per_admin2 = subscribers_per_admin2_per_day_results.groupby(\"pcod\").median()" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
geometrypcodcentroid
0MULTIPOLYGON (((-1.41637 6.32333, -1.43242 6.3...GHA.1.1_1{'type': 'Point', 'coordinates': [-1.585607706...
1MULTIPOLYGON (((-1.54145 5.93492, -1.53550 5.9...GHA.1.2_1{'type': 'Point', 'coordinates': [-1.394701237...
2MULTIPOLYGON (((-1.41025 6.86558, -1.42795 6.8...GHA.1.3_1{'type': 'Point', 'coordinates': [-1.557937016...
3MULTIPOLYGON (((-2.29597 6.71882, -2.29462 6.7...GHA.1.4_1{'type': 'Point', 'coordinates': [-2.201639978...
4MULTIPOLYGON (((-1.81879 6.98329, -1.83513 6.9...GHA.1.5_1{'type': 'Point', 'coordinates': [-1.957634046...
............
132MULTIPOLYGON (((-2.36110 6.08630, -2.36740 6.0...GHA.10.9_1{'type': 'Point', 'coordinates': [-2.609585309...
133MULTIPOLYGON (((-1.82511 5.57554, -1.82662 5.5...GHA.10.11_1{'type': 'Point', 'coordinates': [-2.009582598...
134MULTIPOLYGON (((-1.70347 4.94569, -1.70347 4.9...GHA.10.10_1{'type': 'Point', 'coordinates': [-1.658353642...
135MULTIPOLYGON (((-2.59935 5.44091, -2.59728 5.4...GHA.10.12_1{'type': 'Point', 'coordinates': [-2.332295004...
136MULTIPOLYGON (((-1.97473 4.91827, -1.98106 4.9...GHA.10.13_1{'type': 'Point', 'coordinates': [-1.990833941...
\n", - "

137 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " geometry pcod \\\n", - "0 MULTIPOLYGON (((-1.41637 6.32333, -1.43242 6.3... GHA.1.1_1 \n", - "1 MULTIPOLYGON (((-1.54145 5.93492, -1.53550 5.9... GHA.1.2_1 \n", - "2 MULTIPOLYGON (((-1.41025 6.86558, -1.42795 6.8... GHA.1.3_1 \n", - "3 MULTIPOLYGON (((-2.29597 6.71882, -2.29462 6.7... GHA.1.4_1 \n", - "4 MULTIPOLYGON (((-1.81879 6.98329, -1.83513 6.9... GHA.1.5_1 \n", - ".. ... ... \n", - "132 MULTIPOLYGON (((-2.36110 6.08630, -2.36740 6.0... GHA.10.9_1 \n", - "133 MULTIPOLYGON (((-1.82511 5.57554, -1.82662 5.5... GHA.10.11_1 \n", - "134 MULTIPOLYGON (((-1.70347 4.94569, -1.70347 4.9... GHA.10.10_1 \n", - "135 MULTIPOLYGON (((-2.59935 5.44091, -2.59728 5.4... GHA.10.12_1 \n", - "136 MULTIPOLYGON (((-1.97473 4.91827, -1.98106 4.9... GHA.10.13_1 \n", - "\n", - " centroid \n", - "0 {'type': 'Point', 'coordinates': [-1.585607706... \n", - "1 {'type': 'Point', 'coordinates': [-1.394701237... \n", - "2 {'type': 'Point', 'coordinates': [-1.557937016... \n", - "3 {'type': 'Point', 'coordinates': [-2.201639978... \n", - "4 {'type': 'Point', 'coordinates': [-1.957634046... \n", - ".. ... \n", - "132 {'type': 'Point', 'coordinates': [-2.609585309... \n", - "133 {'type': 'Point', 'coordinates': [-2.009582598... \n", - "134 {'type': 'Point', 'coordinates': [-1.658353642... \n", - "135 {'type': 'Point', 'coordinates': [-2.332295004... \n", - "136 {'type': 'Point', 'coordinates': [-1.990833941... 
\n", - "\n", - "[137 rows x 3 columns]" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get admin2 boundaries\n", - "admin2_geojson = fc.get_geography(connection=conn, aggregation_unit=\"admin2\")\n", - "admin2_gdf = gpd.GeoDataFrame.from_features(admin2_geojson)\n", - "admin2_gdf" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Join median subscriber counts to admin boundaries, and plot\n", - "median_subscribers_per_admin2_with_geo = admin2_gdf.merge(\n", - " median_subscribers_per_admin2, left_on=\"pcod\", right_index=True\n", - ")\n", - "\n", - "median_subscribers_per_admin2_with_geo.plot(column=\"value\", legend=True)" + "The most convenient way to combine the \"total subscribers per day\" results for all dates into a single DataFrame is to use the pandas `concat` function. First, we can use a list comprehension to get a list of individual result DataFrames (`.assign(date=day)` adds the value of `day` in a new `date` column):" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "- Should have larger subscriber counts in urban districts, and smaller in rural districts" + "all_total_subscribers_results = [\n", + " query.get_result().assign(date=day)\n", + " for day, query in total_subscribers_per_day_queries.items()\n", + "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Events per subscriber (for each admin2, then for whole country)" + "Now we can use `pd.concat` to concatenate all the DataFrames in the list `all_total_subscribers_results` into a single DataFrame (the `ignore_index=True` option ensures that each row in the final DataFrame has a unique index, rather than keeping the indexes from the individual DataFrames):" ] }, { "cell_type": "code", - "execution_count": 124, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 124, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "# Total events per day\n", - "average_events_per_subscriber = total_events_per_day_results.set_index(\"date\").value / total_subscribers_per_day_results.set_index(\"date\").value\n", + "total_subscribers_per_day_results = pd.concat(\n", + " all_total_subscribers_results, ignore_index=True\n", + ")\n", "\n", - "average_events_per_subscriber.plot()" + "total_subscribers_per_day_results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Values should be reasonable (e.g. not 500 events per subscriber per day)\n", - "- Values should be >= 1 (because every active subscriber has at least 1 event\n", - "- Values should be fairly stable over time" + "The `total_subscribers_per_day_results` DataFrame contains the total daily subscriber count for each day in our date range. The `pcod` column is not useful in this case - it has the same value for every row, so we can drop it:" ] }, { "cell_type": "code", - "execution_count": 141, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 141, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "# Per admin2\n", - "events_per_subscriber_per_admin2 = (\n", - " events_per_admin2_per_day_results.set_index([\"date\", \"pcod\"]) / subscribers_per_admin2_per_day_results.set_index([\"date\", \"pcod\"])\n", - ").reset_index()\n", - "events_per_subscriber_per_admin2.pivot(index=\"date\", columns=\"pcod\", values=\"value\").plot(legend=False)" + "total_subscribers_per_day_results = total_subscribers_per_day_results.drop(\n", + " columns=\"pcod\"\n", + ")\n", + "total_subscribers_per_day_results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Missing data\n", - "- Total events per day, over time" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "#### 2. Subscriber count per region per day\n", + "\n", + "We can get the \"subscribers per region per day\" results in the same way. This time we're getting the list of individual query results within the call to `pd.concat()` instead of assigning it to a new variable first, but the process here is the same." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "total_events_per_day_results.plot(x=\"date\", y=\"value\")" + "subscribers_per_region_per_day_results = pd.concat(\n", + " [\n", + " query.get_result().assign(date=day)\n", + " for day, query in subscribers_per_region_per_day_queries.items()\n", + " ],\n", + " ignore_index=True,\n", + ")\n", + "\n", + "subscribers_per_region_per_day_results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- If there's an anomaly, show a map of events per admin2 on that day, to see if there's a spatial effect" + "This time the `pcod` column contains the level 1 P-code that identifies each region. There are 10 regions, so we get 10 subscriber counts per day. In this case the `pcod` column contains useful information, so we'll keep it.\n", + "\n", + "#### 3. Subscriber count per district per day\n", + "\n", + "We can get the \"subscribers per district per day\" results in the same way. This DataFrame is larger than `subscribers_per_region_per_day_results`, because there are 137 districts." ] }, { "cell_type": "code", - "execution_count": 156, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 156, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "# E.g. if total event count was lower on 13 March, can plot event count per admin2 on that day\n", - "admin2_gdf.merge(\n", - " events_per_admin2_per_day_results[events_per_admin2_per_day_results.date == \"2016-03-14\"],\n", - " on=\"pcod\",\n", - ").plot(column=\"value\", legend=True)" + "subscribers_per_district_per_day_results = pd.concat(\n", + " [\n", + " query.get_result().assign(date=day)\n", + " for day, query in subscribers_per_district_per_day_queries.items()\n", + " ],\n", + " ignore_index=True,\n", + ")\n", + "\n", + "subscribers_per_district_per_day_results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If no data for parts of the country, suggests the CDR data are incomplete." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Indicators\n", - "\n", - "- Calculate indicators from aggregates\n", - " - Scaled subscriber count per admin2 (including baseline calculation)\n", - " - Average admin2 visited per subscriber for each admin1\n", - " - Scaled OD matrix subscriber count\n", - "- Produce visualisations (not sure exactly what to show for OD matrix - maybe pick a few pairs and show line plots; could also do a before vs after map of Accra -> other)\n", - "- Explain what we see" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Scaled subscriber count per admin2\n", - "\n", - "- Divide subscriber count by total, to mitigate effects of changes in calling behaviour (i.e. assume actual number of people is ~ constant, so changes in total are due to people calling more/less)\n", - "- Calculate baseline average\n", - "- % of baseline" - ] - }, - { - "cell_type": "code", - "execution_count": 181, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pcodvaluedatetotal_subscribersscaled_subscriber_count
0GHA.10.10_1649822016-02-0321382070.030391
1GHA.10.1_1138682016-02-0321382070.006486
2GHA.10.11_189342016-02-0321382070.004178
3GHA.10.12_1144852016-02-0321382070.006774
4GHA.10.13_1227582016-02-0321382070.010643
..................
5580GHA.9.5_1153982016-04-1121370140.007205
5581GHA.9.6_164032016-04-1121370140.002996
5582GHA.9.7_1164602016-04-1121370140.007702
5583GHA.9.8_1254062016-04-1121370140.011889
5584GHA.9.9_1118492016-04-1121370140.005545
\n", - "

5585 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " pcod value date total_subscribers \\\n", - "0 GHA.10.10_1 64982 2016-02-03 2138207 \n", - "1 GHA.10.1_1 13868 2016-02-03 2138207 \n", - "2 GHA.10.11_1 8934 2016-02-03 2138207 \n", - "3 GHA.10.12_1 14485 2016-02-03 2138207 \n", - "4 GHA.10.13_1 22758 2016-02-03 2138207 \n", - "... ... ... ... ... \n", - "5580 GHA.9.5_1 15398 2016-04-11 2137014 \n", - "5581 GHA.9.6_1 6403 2016-04-11 2137014 \n", - "5582 GHA.9.7_1 16460 2016-04-11 2137014 \n", - "5583 GHA.9.8_1 25406 2016-04-11 2137014 \n", - "5584 GHA.9.9_1 11849 2016-04-11 2137014 \n", - "\n", - " scaled_subscriber_count \n", - "0 0.030391 \n", - "1 0.006486 \n", - "2 0.004178 \n", - "3 0.006774 \n", - "4 0.010643 \n", - "... ... \n", - "5580 0.007205 \n", - "5581 0.002996 \n", - "5582 0.007702 \n", - "5583 0.011889 \n", - "5584 0.005545 \n", - "\n", - "[5585 rows x 5 columns]" - ] - }, - "execution_count": 181, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Merge admin2 and total subscribers\n", - "merged_subscriber_counts = subscribers_per_admin2_per_day_results.merge(\n", - " total_subscribers_per_day_results.rename(columns={\"value\": \"total_subscribers\"}), on=\"date\"\n", - ")\n", + "#### 4. 
District-level OD matrix per day\n", "\n", - "# Divide sub count by total subs to get scaled sub count\n", + "The process for getting the OD matrix results is the same:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "od_matrix_district_per_day_results = pd.concat(\n", + " [\n", + " query.get_result().assign(date=day)\n", + " for day, query in od_matrix_district_per_day_queries.items()\n", + " ],\n", + " ignore_index=True,\n", + ")\n", + "\n", + "od_matrix_district_per_day_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This DataFrame has four columns:\n", + "\n", + "- `pcod_from` is the P-code of the first district,\n", + "- `pcod_to` is the P-code of the second district,\n", + "- `value` is the number of unique subscribers who were active in `pcod_from` and later `pcod_to`,\n", + "- `date` is the date on which subscribers were active.\n", + "\n", + "#### 5. Total event count per day\n", + "\n", + "\"Total events per day\" is a single query, so we can get the full result using a single call to `get_result()`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "total_events_per_day_results = total_events_per_day_query.get_result()\n", + "total_events_per_day_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This has three columns: `pcod` (the P-code for Ghana - this is always \"GH\"), `date` (the date on which events were counted) and `value` (the count of CDR events on that date). 
At first glance it looks like this is in the format we need, but let's check the information about this DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "total_events_per_day_results.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The datatype of the `date` column is 'object' (i.e. string). For later analysis, it will be useful to convert this column to 'datetime' data type, which we can do using the pandas `to_datetime` function. Let's also drop the \"pcod\" column (as we did for \"total subscribers per day\"), because this doesn't contain useful information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "total_events_per_day_results[\"date\"] = pd.to_datetime(\n", + " total_events_per_day_results[\"date\"]\n", + ")\n", + "\n", + "total_events_per_day_results = total_events_per_day_results.drop(columns=[\"pcod\"])\n", + "\n", + "total_events_per_day_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6. Event count per district per day\n", + "\n", + "\"Events per district per day\" is a single query, like \"total events per day\", so we can get the result in the same way. Again, we'll convert the `date` column to 'datetime' type. This time the `pcod` column is important (it identifies the districts), so we'll keep it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "events_per_district_per_day_results = events_per_district_per_day_query.get_result()\n", + "events_per_district_per_day_results[\"date\"] = pd.to_datetime(\n", + " events_per_district_per_day_results[\"date\"]\n", + ")\n", + "\n", + "events_per_district_per_day_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### QA checks\n", + "\n", + "It is possible that some of the CDR data available in FlowKit may be incomplete or incorrect. If this is the case, conclusions we draw from our aggregates may be incorrect or misleading, so it is important to perform some quality checks on our query results before we use them to investigate mobility patterns. In this section we'll go through some of these checks." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Time-series plots\n", + "\n", + "A simple check we can perform is to plot the subscriber counts over time, and look for any unexpected changes. For example, we can plot the total subscriber count per day:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "total_subscribers_per_day_results.plot(x=\"date\", y=\"value\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the total subscriber count per day remains stable over time, except for some very small fluctuations. If there were any large changes, this could indicate either missing data or changes in calling behaviour, either of which would make the insights from these results unreliable.\n", + "\n", + "**Note:** In real data, we would expect to see a repeating weekly pattern of variation in these results - for example, the number of active subscribers on a Sunday may typically be lower than on a weekday. 
The data we're using here are synthetic, and there is no such pattern present.\n", + "\n", + "We can make a similar plot of subscriber counts per region, using the pandas `pivot` method to transform the DataFrame into a structure that can easily be plotted as multiple lines:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subscribers_per_region_per_day_results.pivot(\n", + " index=\"date\", columns=\"pcod\", values=\"value\"\n", + ").plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we can see that there is a very large drop in the subscriber counts in 'GHA.1_1' (Ashanti region) on 1st March, followed by a smaller increase on 10th March. Subscriber counts in all other regions increase when the subscriber counts in Ashanti decrease. This is due to the modelled mobility pattern in the synthetic dataset, in which all subscribers were forced to leave the Ashanti region on 1st March and were allowed to begin returning from 10th March. So in this case, the changes seen here are due to \"real\" mobility changes, rather than problems with the data.\n", + "\n", + "**Note:** In a real-life scenario we would not usually expect changes due to mobility restrictions to be as large or as sudden as those seen here. However, such changes would typically be visible in these plots, so care should be taken when trying to determine whether a change is a \"real\" effect or a data issue.\n", + "\n", + "We can make a similar plot of district-level subscriber counts:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subscribers_per_district_per_day_results.pivot(\n", + " index=\"date\", columns=\"pcod\", values=\"value\"\n", + ").plot(legend=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again we can see the sharp changes on 1st and 10th March. 
Due to the large number of districts, it may be more insightful to plot the sum of subscriber counts across all districts. Let's include the \"total subscribers per day\" data in the same plot, for comparison - we can do this by creating a matplotlib figure, and specifying `ax=plt.gca()` (\"get current axes\") to plot multiple results in the same plot." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a matplotlib figure\n", + "plt.figure()\n", + "\n", + "# Show total subscriber count for comparison\n", + "total_subscribers_per_day_results.plot(\n", + " ax=plt.gca(), x=\"date\", y=\"value\", label=\"total subscribers\"\n", + ")\n", + "\n", + "# Plot the sum of per-district subscriber counts across all districts\n", + "subscribers_per_district_per_day_results.groupby(\"date\").sum().plot(\n", + " ax=plt.gca(), y=\"value\", label=\"sum(subscribers per district)\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the sum of subscriber counts across all districts is also stable over time (again, if we were using real data we'd expect to see weekly variation, and changes due to mobility restrictions). 
Also, the sum of district-level subscriber counts is always larger than the total subscriber count - this is because some subscribers will be active in multiple districts on the same day, so they will be counted multiple times in the sum, whereas each subscriber is counted only once in the country-level total subscriber count.\n", + "\n", + "Let's also plot the sum of OD matrix counts:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()\n", + "\n", + "# Show total subscriber count for comparison\n", + "total_subscribers_per_day_results.plot(\n", + " ax=plt.gca(), x=\"date\", y=\"value\", label=\"total subscribers\"\n", + ")\n", + "\n", + "# Plot the sum of per-district subscriber counts across all districts\n", + "subscribers_per_district_per_day_results.groupby(\"date\").sum().plot(\n", + " ax=plt.gca(), y=\"value\", label=\"sum(subscribers per district)\"\n", + ")\n", + "\n", + "# Similar for OD matrix\n", + "od_matrix_district_per_day_results.groupby(\"date\").sum().plot(\n", + " ax=plt.gca(), y=\"value\", label=\"sum(OD matrix)\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The sum of OD matrix counts is smaller than the total subscriber count - this is not unexpected, because some subscribers will only be active in a single district so will not be included in the OD matrix.\n", + "\n", + "There is an effect from the mobility changes on 1st and 10th March, but again there is no obvious sign of data issues." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Spatial distribution of subscriber counts\n", + "\n", + "Another thing we can check is the spatial distribution of subscriber counts. We would expect subscriber counts in urban districts to be larger than those in rural districts. 
To check this, let's calculate the median subscriber count per district:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "median_subscribers_per_district = subscribers_per_district_per_day_results.groupby(\n", + " \"pcod\"\n", + ").median()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To plot the median subscriber counts on a map, we need to get the geographic boundaries of the districts. We can do this using the flowclient `get_geography` function, as described in the [\"Geography\" tutorial](03-geography.ipynb):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "districts_geojson = fc.get_geography(connection=conn, aggregation_unit=\"admin2\")\n", + "districts_gdf = gpd.GeoDataFrame.from_features(districts_geojson)\n", + "districts_gdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now join the district boundaries to the median subscriber counts, and display them on a map:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "median_subscribers_per_district_with_geo = districts_gdf.merge(\n", + " median_subscribers_per_district, left_on=\"pcod\", right_index=True\n", + ")\n", + "\n", + "median_subscribers_per_district_with_geo.plot(column=\"value\", legend=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the subscriber count in Accra is much larger than in other districts, as we would expect.\n", + "\n", + "It is difficult to compare the subscriber counts in other districts on this colour scale. It would be clearer to use a log scale - we can do this using `matplotlib.colors.LogNorm`. 
First, we import the `matplotlib.colors` library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.colors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can plot the median subscriber counts in the same way as we did before, but additionally specifying the `norm` parameter. We use `matplotlib.colors.LogNorm()`, which takes two arguments `vmin` and `vmax` (the minimum and maximum values in the colour range):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "median_subscribers_per_district_with_geo.plot(\n", + " column=\"value\",\n", + " legend=True,\n", + " norm=matplotlib.colors.LogNorm(\n", + " vmin=median_subscribers_per_district_with_geo.value.min(),\n", + " vmax=median_subscribers_per_district_with_geo.value.max(),\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the cities of Accra, Kumasi, Tamale and Sekondi-Takoradi all have higher subscriber counts than their surrounding districts, which aligns with what we would expect. Subscriber counts in the Ashanti region are smaller - this is in line with the time-series plot we made earlier of subscriber counts per region." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Events per subscriber\n", + "\n", + "We can also check the average number of events per subscriber per day. We should always have at least 1 event per subscriber (because a subscriber with no events would not be present in the dataset), but this value should not be too large - it would be hard to believe that subscribers make 100 calls per day on average, for example. 
A \"reasonable\" value here will depend on which CDR data types are included - there would typically be more mobile data sessions than phone calls per subscriber per day.\n", + "\n", + "The number of events per subscriber should be fairly stable over time. If this is not the case, it could indicate that there have been substantial changes in calling behaviour, which could make our results unreliable.\n", + "\n", + "We can calculate the average events per subscriber for the whole country by dividing the total event count per day (from `total_events_per_day_results`) by the total subscriber count per day (from `total_subscribers_per_day_results`):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "average_events_per_subscriber = (\n", + " total_events_per_day_results.set_index(\"date\").value\n", + " / total_subscribers_per_day_results.set_index(\"date\").value\n", + ")\n", + "\n", + "average_events_per_subscriber.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the average number of events per subscriber is around 2.3, which is reasonable, and there are no large changes over time.\n", + "\n", + "Similarly, we can calculate the average number of events per subscriber in each district by dividing event counts in `events_per_district_per_day_results` by subscriber counts in `subscribers_per_district_per_day_results`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "events_per_subscriber_per_district = (\n", + " events_per_district_per_day_results.set_index([\"date\", \"pcod\"])\n", + " / subscribers_per_district_per_day_results.set_index([\"date\", \"pcod\"])\n", + ").reset_index()\n", + "\n", + "events_per_subscriber_per_district.pivot(\n", + " index=\"date\", columns=\"pcod\", values=\"value\"\n", + ").plot(legend=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "The changes on 1st and 10th March correspond to a large decrease in the number of events per subscriber in some districts. This could be because the residents of these districts have moved away, so the subscribers we see just made brief visits to the district, or could be because subscribers in these districts are making fewer calls than usual. In either case, we should be cautious when interpreting a change in subscriber count as a corresponding change in population size, as some of the apparent change may be due to different phone usage." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Missing data\n", + "\n", + "It is possible that some of the CDR data ingested into FlowKit's database may be incomplete - there may be missing data on certain days, or for certain geographic areas. To check for missing data we can plot the total event count per day:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "total_events_per_day_results.plot(x=\"date\", y=\"value\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, we can see that the event count is quite stable over time, and there are no days with a much smaller event count than usual.\n", + "\n", + "If we noticed that the event count on one day was significantly smaller than usual, we could look at the district-level subscriber counts to see whether there are areas with missing data. 
For example, to plot the event counts per district on 21st February (using a log colour scale, as we did previously):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "events_per_district_20160221 = events_per_district_per_day_results[\n", + " events_per_district_per_day_results.date == \"2016-02-21\"\n", + "]\n", + "\n", + "districts_gdf.merge(events_per_district_20160221, on=\"pcod\",).plot(\n", + " column=\"value\",\n", + " legend=True,\n", + " norm=matplotlib.colors.LogNorm(\n", + " vmin=events_per_district_20160221.value.min(),\n", + " vmax=events_per_district_20160221.value.max(),\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If there were parts of the country with significantly lower event counts than the rest, this could indicate missing data.\n", + "\n", + "If the aggregates are re-calculated regularly from new available dates of CDR data, these QA checks should be updated each time to check for issues in the new data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mobility indicators\n", + "\n", + "The aggregates we have calculated do not reveal any personal information about subscribers. However, the raw aggregates are not appropriate to share with a wider audience, for two reasons:\n", + "\n", + "1. The aggregates may reveal commercially sensitive information (for example, the size or geographic distribution of an MNO's subscriber base),\n", + "2. 
The aggregates could be misleading (for example, a subscriber count of 10000 could be wrongly interpreted as a population count of 10000 people, whereas the true relationship between subscriber count and population count would depend on the MNO's market share, subscribers' calling behaviour, and other factors).\n", + "\n", + "In this section we will use the aggregates to calculate some \"mobility indicators\", which are scaled so that they do not reveal the absolute size or distribution of subscriber counts, while aiming to represent relevant mobility information.\n", + "\n", + "We will calculate three mobility indicators:\n", + "\n", + "1. Percentage change in subscriber presence per district\n", + "2. Average number of districts visited per subscriber within each region\n", + "3. Percentage change in subscriber movements between districts\n", + "\n", + "Details of other mobility indicators can be found on our [COVID-19 website](https://covid19.flowminder.org/).\n", + "\n", + "We need to choose a \"baseline period\" - this is a period before mobility restrictions began, which is assumed to display normal mobility. The baseline period should be at least 4 weeks long, so that baseline averages are not overly skewed by unusual events such as public holidays, and should be a whole number of weeks so that baseline averages are not skewed by including a larger number of Sundays than weekdays (for example). The first mobility restrictions in our example started on 1 March 2016, so we will choose the 4 weeks immediately before this as our baseline period:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_start = \"2016-02-02\"\n", + "baseline_end = \"2016-03-01\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is also useful, if possible, to identify a stable period during the period of mobility restrictions. 
In our example, the most recent change in mobility restrictions occurred on 10 March 2016, so we will take all dates from 10 March onwards as our stable period:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stable_period_start = \"2016-03-10\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1. Percentage change in subscriber presence per district\n", + "\n", + "This indicator is the percentage change in the number of subscribers in a district, relative to the median number of subscribers during the baseline period.\n", + "\n", + "The subscriber counts we have calculated could be affected by calling behaviour - if fewer subscribers make calls on a particular day, the subscriber count will be lower although those subscribers may still be present in a district, without making a call. To correct for this effect, we can divide the subscriber counts per district by the total subscriber count for the country each day. Assuming that the total number of people in the country doesn't significantly change during the period we're investigating, this will mitigate the effects of changes in calling behaviour.\n", + "\n", + "To calculate the \"scaled subscriber count\" per district (i.e. 
the subscriber count per district divided by the total subscriber count for the country), we first need to merge the district-level subscriber counts with the total subscriber counts, joining on the 'date' column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "merged_subscriber_counts = subscribers_per_district_per_day_results.merge(\n", + " total_subscribers_per_day_results.rename(columns={\"value\": \"total_subscribers\"}),\n", + " on=\"date\",\n", + ")\n", + "\n", + "merged_subscriber_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now add a new `\"scaled_subscriber_count\"` column to this DataFrame, calculated as the district-level subscriber count divided by the total subscriber count:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "merged_subscriber_counts[\"scaled_subscriber_count\"] = (\n", " merged_subscriber_counts.value / merged_subscriber_counts.total_subscribers\n", ")\n", "\n", - "merged_subscriber_counts" + "merged_subscriber_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to calculate the percentage change in the scaled subscriber count relative to the baseline period, so we need to define a single \"baseline\" scaled subscriber count for each district. We define this as the median of the daily scaled subscriber counts over all days in the baseline period. 
To calculate this, we filter the `merged_subscriber_counts` DataFrame to include only dates within the baseline period, then group by district (identified by the \"pcod\" column) and find the median of the \"scaled_subscriber_count\" column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scaled_subscriber_count_baseline = (\n", + " merged_subscriber_counts[\n", + " (merged_subscriber_counts.date >= baseline_start)\n", + " & (merged_subscriber_counts.date < baseline_end)\n", + " ]\n", + " .groupby(\"pcod\")\n", + " .median()\n", + " .scaled_subscriber_count\n", + ")\n", + "\n", + "scaled_subscriber_count_baseline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can merge these baseline values into the `merged_subscriber_counts` DataFrame as a new \"scaled_subscriber_count_baseline\" column, joining on the \"pcod\" column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "merged_subscriber_counts = merged_subscriber_counts.merge(\n", + " scaled_subscriber_count_baseline,\n", + " left_on=\"pcod\",\n", + " right_index=True,\n", + " suffixes=(\"\", \"_baseline\"),\n", + ")\n", + "\n", + "merged_subscriber_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now calculate the percentage change as\n", + "$$\n", + "100 \\times \\left(\\frac{\\mathrm{scaled\\ subscriber\\ count}}{\\mathrm{scaled\\ subscriber\\ count\\ baseline}} - 1\\right)\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "merged_subscriber_counts[\"percent_change\"] = (\n", + " merged_subscriber_counts.scaled_subscriber_count\n", + " / merged_subscriber_counts.scaled_subscriber_count_baseline\n", + " - 1\n", + ") * 100\n", + "\n", + "merged_subscriber_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "Let's choose a few key districts, and plot the percentage change in subscriber presence over time:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_districts = {\n", + " \"Accra\": \"GHA.5.1_1\",\n", + " \"Kumasi\": \"GHA.1.16_1\",\n", + " \"Tamale\": \"GHA.6.13_1\",\n", + "}\n", + "\n", + "plt.figure()\n", + "for name, pcod in display_districts.items():\n", + " merged_subscriber_counts[merged_subscriber_counts.pcod == pcod].plot(\n", + " ax=plt.gca(), x=\"date\", y=\"percent_change\", marker=\".\", ls=\"\", label=name\n", + " )\n", + "plt.axhline(0, ls=\":\", c=\"k\")\n", + "plt.ylim(bottom=-100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the subscriber presence in Kumasi decreased by more than 80% from early March, compared to the baseline period. In fact, there are no data for 1-10 March, suggesting that there were no active subscribers at all in Kumasi during this time. Meanwhile, the subscriber presence in Accra and Tamale increased by 20%. It appears that almost everybody in Kumasi has moved away to other districts.\n", + "\n", + "We can also look at the percentage changes in all districts, on a map. 
Since the percentage change will be different each day, let's find the median percentage change per district over the \"stable period\" from 10 March onwards:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "median_subscriber_count_percent_change = (\n", + " merged_subscriber_counts[merged_subscriber_counts.date >= stable_period_start]\n", + " .groupby(\"pcod\")\n", + " .median()\n", + " .percent_change\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now join the district boundaries to the median percentage changes and show these on a map (we'll use the \"Spectral\" colour map with a colour scale from -100% to 100%):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "districts_gdf.merge(\n", + " median_subscriber_count_percent_change, left_on=\"pcod\", right_index=True\n", + ").plot(\n", + " column=\"percent_change\",\n", + " legend=True,\n", + " cmap=\"Spectral\",\n", + " norm=plt.Normalize(vmin=-100, vmax=100),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that subscriber presence in the Ashanti region decreased by 80% following mobility restrictions, and subscriber presence across the rest of the country increased by around 20%." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Average number of districts visited per subscriber within each region\n", + "\n", + "We can also look at the average number of districts visited per subscriber within each region. 
This is an indicator of the amount of movement in each region - if people move around less, they will be seen in fewer districts.\n", + "\n", + "The average number of districts per subscriber within each region is the sum of subscriber counts for all districts within a region, divided by the overall subscriber count for that region.\n", + "\n", + "We will start by mapping each district to a region. We can do this by looking at the P-codes - district \"GHA.X.Y_1\" is within region \"GHA.X_1\", so we can find the region P-code from the district P-code by taking everything before the final `\".\"`, and appending `\"_1\"`. Let's add this as a new \"region_pcod\" column in the `subscribers_per_district_per_day_results` DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subscribers_per_district_per_day_results[\n", + " \"region_pcod\"\n", + "] = subscribers_per_district_per_day_results.pcod.apply(\n", + " lambda x: x.rsplit(\".\", 1)[0] + \"_1\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now group by date and region, and sum the subscriber counts:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "districts_per_region_stats = (\n", + " subscribers_per_district_per_day_results.groupby([\"date\", \"region_pcod\"])\n", + " .sum()\n", + " .reset_index()\n", + ")\n", + "\n", + "# Rename column to be more explicit\n", + "districts_per_region_stats = districts_per_region_stats.rename(\n", + " columns={\"value\": \"sum_district_subscriber_counts\"}\n", + ")\n", + "\n", + "districts_per_region_stats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can merge the summed district-level subscriber counts with the region-level subscriber counts:" ] }, { "cell_type": "code", - "execution_count": 182, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"pcod\n", - "GHA.1.10_1 0.004650\n", - "GHA.1.11_1 0.009366\n", - "GHA.1.12_1 0.005379\n", - "GHA.1.13_1 0.011214\n", - "GHA.1.14_1 0.006864\n", - " ... \n", - "GHA.9.5_1 0.005901\n", - "GHA.9.6_1 0.002443\n", - "GHA.9.7_1 0.006250\n", - "GHA.9.8_1 0.009675\n", - "GHA.9.9_1 0.004555\n", - "Name: scaled_subscriber_count, Length: 137, dtype: float64" - ] - }, - "execution_count": 182, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Baseline median\n", - "baseline_start = \"2016-02-01\"\n", - "baseline_end = \"2016-02-29\"\n", - "\n", - "scaled_subscriber_count_baseline = merged_subscriber_counts[\n", - " (merged_subscriber_counts.date >= baseline_start)\n", - " & (merged_subscriber_counts.date < baseline_end)\n", - "].groupby(\"pcod\").median().scaled_subscriber_count\n", - "scaled_subscriber_count_baseline" + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Merge with region counts\n", + "districts_per_region_stats = districts_per_region_stats.merge(\n", + " subscribers_per_region_per_day_results,\n", + " left_on=[\"date\", \"region_pcod\"],\n", + " right_on=[\"date\", \"pcod\"],\n", + ")\n", + "\n", + "# Rename value column\n", + "districts_per_region_stats = districts_per_region_stats.rename(\n", + " columns={\"value\": \"region_subscriber_count\"}\n", + ")\n", + "\n", + "districts_per_region_stats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can calculate the average number of districts visited per subscriber per day within each region, by dividing the summed district-level subscriber counts by the region-level subscriber count:" ] }, { "cell_type": "code", - "execution_count": 183, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pcodvaluedatetotal_subscribersscaled_subscriber_countscaled_subscriber_count_baseline
0GHA.10.10_1649822016-02-0321382070.0303910.030399
137GHA.10.10_1651102016-02-0421366910.0304720.030399
274GHA.10.10_1649112016-02-0721371920.0303720.030399
411GHA.10.10_1649482016-02-1021368680.0303940.030399
548GHA.10.10_1651592016-02-1121370270.0304900.030399
.....................
5036GHA.9.9_1118122016-04-0721373040.0055270.004555
5173GHA.9.9_1117962016-04-0821377080.0055180.004555
5310GHA.9.9_1117702016-04-0921370870.0055070.004555
5447GHA.9.9_1118642016-04-1021359130.0055550.004555
5584GHA.9.9_1118492016-04-1121370140.0055450.004555
\n", - "

5585 rows × 6 columns

\n", - "
" - ], - "text/plain": [ - " pcod value date total_subscribers \\\n", - "0 GHA.10.10_1 64982 2016-02-03 2138207 \n", - "137 GHA.10.10_1 65110 2016-02-04 2136691 \n", - "274 GHA.10.10_1 64911 2016-02-07 2137192 \n", - "411 GHA.10.10_1 64948 2016-02-10 2136868 \n", - "548 GHA.10.10_1 65159 2016-02-11 2137027 \n", - "... ... ... ... ... \n", - "5036 GHA.9.9_1 11812 2016-04-07 2137304 \n", - "5173 GHA.9.9_1 11796 2016-04-08 2137708 \n", - "5310 GHA.9.9_1 11770 2016-04-09 2137087 \n", - "5447 GHA.9.9_1 11864 2016-04-10 2135913 \n", - "5584 GHA.9.9_1 11849 2016-04-11 2137014 \n", - "\n", - " scaled_subscriber_count scaled_subscriber_count_baseline \n", - "0 0.030391 0.030399 \n", - "137 0.030472 0.030399 \n", - "274 0.030372 0.030399 \n", - "411 0.030394 0.030399 \n", - "548 0.030490 0.030399 \n", - "... ... ... \n", - "5036 0.005527 0.004555 \n", - "5173 0.005518 0.004555 \n", - "5310 0.005507 0.004555 \n", - "5447 0.005555 0.004555 \n", - "5584 0.005545 0.004555 \n", - "\n", - "[5585 rows x 6 columns]" - ] - }, - "execution_count": 183, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "merged_subscriber_counts = merged_subscriber_counts.merge(\n", - " scaled_subscriber_count_baseline,\n", - " left_on=\"pcod\",\n", + "districts_per_region_stats[\"average_districts_per_subscriber\"] = (\n", + " districts_per_region_stats.sum_district_subscriber_counts\n", + " / districts_per_region_stats.region_subscriber_count\n", + ")\n", + "\n", + "districts_per_region_stats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plot the average districts per subscriber over time for all regions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "districts_per_region_stats.pivot(\n", + " index=\"date\", columns=\"region_pcod\", values=\"average_districts_per_subscriber\"\n", + ").plot()" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that in all cases the average districts per subscriber is very close to 1, so levels of movement were already quite low before the mobility restrictions. After the mobility restrictions, the value in GHA.1_1 (Ashanti) drops lower (so the subscribers remaining in Ashanti are not moving around the region as much) while the values for all other regions show only a small increase." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Percentage change in subscriber movements between districts\n", + "\n", + "Finally, we can look at the changes in subscriber movements between districts. To do this, we will use the \"district-level OD matrix per day\" aggregate, which counts the number of subscribers seen in both districts of each pair per day. We will calculate the percentage change in subscriber movements, using a process very similar to the one we used for the \"percentage change in subscriber presence per day\" indicator:\n", + "\n", + "1. Scale the OD matrix subscriber count for each pair of districts by the total subscriber count for that day, to correct for the effects of changes in calling behaviour.\n", + "2. Calculate the median \"scaled subscriber count\" for each pair of districts over the baseline period.\n", + "3. 
For each pair of districts each day, calculate the percentage change in scaled subscriber count relative to the baseline.\n", + "\n", + "We start by merging the OD matrix DataFrame with the \"total subscribers per day\" DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "od_merged_subscriber_counts = od_matrix_district_per_day_results.merge(\n", + " total_subscribers_per_day_results.rename(columns={\"value\": \"total_subscribers\"}),\n", + " on=\"date\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we divide the subscriber count for each pair of districts by the total subscriber count for that day, to get the \"scaled subscriber count\":" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "od_merged_subscriber_counts[\"scaled_subscriber_count\"] = (\n", + " od_merged_subscriber_counts.value / od_merged_subscriber_counts.total_subscribers\n", + ")\n", + "\n", + "od_merged_subscriber_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now calculate the median \"scaled subscriber count\" for each pair of districts over the baseline period:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "od_scaled_subscriber_count_baseline = (\n", + " od_merged_subscriber_counts[\n", + " (od_merged_subscriber_counts.date >= baseline_start)\n", + " & (od_merged_subscriber_counts.date < baseline_end)\n", + " ]\n", + " .groupby([\"pcod_from\", \"pcod_to\"])\n", + " .median()\n", + " .scaled_subscriber_count\n", + ")\n", + "od_scaled_subscriber_count_baseline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now merge the baseline values back into the `od_merged_subscriber_counts` DataFrame, joining on the \"pcod_from\" and \"pcod_to\" columns:" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "od_merged_subscriber_counts = od_merged_subscriber_counts.merge(\n", + " od_scaled_subscriber_count_baseline,\n", + " left_on=[\"pcod_from\", \"pcod_to\"],\n", " right_index=True,\n", - " suffixes=(\"\", \"_baseline\")\n", + " suffixes=(\"\", \"_baseline\"),\n", ")\n", "\n", - "merged_subscriber_counts" + "od_merged_subscriber_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, calculate the percentage change from the baseline value, for each pair of districts each day:" ] }, { "cell_type": "code", - "execution_count": 184, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pcodvaluedatetotal_subscribersscaled_subscriber_countscaled_subscriber_count_baselinepercent_change
0GHA.10.10_1649822016-02-0321382070.0303910.030399-0.027144
137GHA.10.10_1651102016-02-0421366910.0304720.0303990.240851
274GHA.10.10_1649112016-02-0721371920.0303720.030399-0.088948
411GHA.10.10_1649482016-02-1021368680.0303940.030399-0.016840
548GHA.10.10_1651592016-02-1121370270.0304900.0303990.300517
........................
5036GHA.9.9_1118122016-04-0721373040.0055270.00455521.333545
5173GHA.9.9_1117962016-04-0821377080.0055180.00455521.146293
5310GHA.9.9_1117702016-04-0921370870.0055070.00455520.914395
5447GHA.9.9_1118642016-04-1021359130.0055550.00455521.947058
5584GHA.9.9_1118492016-04-1121370140.0055450.00455521.730128
\n", - "

5585 rows × 7 columns

\n", - "
" - ], - "text/plain": [ - " pcod value date total_subscribers \\\n", - "0 GHA.10.10_1 64982 2016-02-03 2138207 \n", - "137 GHA.10.10_1 65110 2016-02-04 2136691 \n", - "274 GHA.10.10_1 64911 2016-02-07 2137192 \n", - "411 GHA.10.10_1 64948 2016-02-10 2136868 \n", - "548 GHA.10.10_1 65159 2016-02-11 2137027 \n", - "... ... ... ... ... \n", - "5036 GHA.9.9_1 11812 2016-04-07 2137304 \n", - "5173 GHA.9.9_1 11796 2016-04-08 2137708 \n", - "5310 GHA.9.9_1 11770 2016-04-09 2137087 \n", - "5447 GHA.9.9_1 11864 2016-04-10 2135913 \n", - "5584 GHA.9.9_1 11849 2016-04-11 2137014 \n", - "\n", - " scaled_subscriber_count scaled_subscriber_count_baseline \\\n", - "0 0.030391 0.030399 \n", - "137 0.030472 0.030399 \n", - "274 0.030372 0.030399 \n", - "411 0.030394 0.030399 \n", - "548 0.030490 0.030399 \n", - "... ... ... \n", - "5036 0.005527 0.004555 \n", - "5173 0.005518 0.004555 \n", - "5310 0.005507 0.004555 \n", - "5447 0.005555 0.004555 \n", - "5584 0.005545 0.004555 \n", - "\n", - " percent_change \n", - "0 -0.027144 \n", - "137 0.240851 \n", - "274 -0.088948 \n", - "411 -0.016840 \n", - "548 0.300517 \n", - "... ... 
\n", - "5036 21.333545 \n", - "5173 21.146293 \n", - "5310 20.914395 \n", - "5447 21.947058 \n", - "5584 21.730128 \n", - "\n", - "[5585 rows x 7 columns]" - ] - }, - "execution_count": 184, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "merged_subscriber_counts[\"percent_change\"] = (\n", - " merged_subscriber_counts.scaled_subscriber_count / merged_subscriber_counts.scaled_subscriber_count_baseline - 1\n", + "od_merged_subscriber_counts[\"percent_change\"] = (\n", + " od_merged_subscriber_counts.scaled_subscriber_count\n", + " / od_merged_subscriber_counts.scaled_subscriber_count_baseline\n", + " - 1\n", ") * 100\n", - "merged_subscriber_counts" + "od_merged_subscriber_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To show the percentage changes on a map, let's calculate the median percent changes over the \"stable period\":" ] }, { "cell_type": "code", - "execution_count": 190, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(-100.0, 31.561113800097285)" - ] - }, - "execution_count": 190, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Choose a few districts to show\n", - "display_districts = {\n", - " \"Accra\": \"GHA.5.1_1\",\n", - " \"Kumasi\": \"GHA.1.16_1\",\n", - " \"Tamale\": \"GHA.6.13_1\",\n", - "}\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "median_od_subscriber_count_percent_change = (\n", + " od_merged_subscriber_counts[od_merged_subscriber_counts.date >= stable_period_start]\n", + " .groupby([\"pcod_from\", \"pcod_to\"])\n", + " .median()\n", + " .percent_change.reset_index()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We could choose a district, and show the median percentage changes in movements from that district to all other districts. For this, we need to join the percentage changes data to the district boundaries, using the \"pcod_to\" column.\n", "\n", - "# Plot % change over time for the chosen districts\n", - "plt.figure()\n", - "for name, pcod in display_districts.items():\n", - " merged_subscriber_counts[merged_subscriber_counts.pcod == pcod].plot(\n", - " ax=plt.gca(), x=\"date\", y=\"percent_change\", marker=\".\", ls=\"\", label=name\n", - " )\n", - "plt.axhline(0, ls=\":\", c='k')\n", - "plt.ylim(bottom=-100)" + "**Note:** If we wanted to show movements in the other direction (i.e. movements from all districts to a chosen district), we would join to the district boundaries using the \"pcod_from\" column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Join to geo (on the \"to\" location)\n", + "median_od_percent_change_with_geo = districts_gdf.merge(\n", + " median_od_subscriber_count_percent_change, left_on=\"pcod\", right_on=\"pcod_to\",\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Average admin2 per subscriber in each admin1\n", + "Let's plot the percent change in movements from each of Accra, Kumasi and Tamale:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_districts = {\n", + " \"Accra\": \"GHA.5.1_1\",\n", + " \"Kumasi\": \"GHA.1.16_1\",\n", + " \"Tamale\": \"GHA.6.13_1\",\n", + "}\n", "\n", - "- Add admin1 pcode column to admin2 counts\n", - "- Sum admin2 counts per admin1\n", - "- Join to admin1 counts\n", - "- Divide\n", - "- Plot over time for all admin1 regions" + "for name, pcod in display_districts.items():\n", + " median_od_percent_change_with_geo[\n", + " median_od_percent_change_with_geo.pcod_from == pcod\n", + " ].plot(\n", + " column=\"percent_change\",\n", + " legend=True,\n", + " cmap=\"Spectral\",\n", + " norm=plt.Normalize(vmin=-100, vmax=100),\n", + " legend_kwds={\"label\": name},\n", + " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Scaled OD matrix counts\n", - "- Same process as for scaled admin2 counts\n", - "- Show a map of before vs after for either Accra or Kumasi (not sure which will look better)" + "From Accra and Tamale, movements to the Ashanti region have decreased, while movements to other districts have increased slightly. For Kumasi, movements to all other districts have decreased, especially to the rest of Ashanti (reflecting the smaller number of subscribers remaining in the region)." 
] }, { @@ -2238,9 +1613,13 @@ "source": [ "### Summary\n", "\n", - "- What we did\n", - "- What it showed\n", - "- Maybe some advice on how this could be modified to do other things" + "In this example, we used FlowKit to extract aggregates from CDR data, and used this to produce mobility indicators. Before producing the indicators, we performed some QA checks on the aggregates to check that there were no issues with the underlying CDR data that could affect the validity of our mobility insights.\n", + "\n", + "We discovered that there was a large decrease in subscriber presence in the Ashanti region from 1 March 2016, followed by a small recovery on 10 March 2016, and a corresponding increase in subscriber presence in the rest of the country (so it appears that there was a large displacement of people from the Ashanti region to the rest of the country).\n", + "\n", + "**Note:** This example is built on a synthetic dataset, which is not intended to represent the effects of real-life mobility restrictions.\n", + "\n", + "In the real-world scenario of an ongoing disease epidemic, the effects of mobility restrictions could be monitored in near-real-time by updating these indicators every few days, provided regular updates of CDR data are being added to the FlowKit server. For more information on the use of CDR data to monitor mobility during a disease epidemic, visit Flowminder's [COVID-19 website](https://covid19.flowminder.org/)." ] } ], From 72baa507882e808b44397e7fe790b0b089bacf24 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sat, 1 Aug 2020 20:23:14 +0100 Subject: [PATCH 3/3] Refer to example from index --- index.ipynb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/index.ipynb b/index.ipynb index 285346f..5ca74c8 100644 --- a/index.ipynb +++ b/index.ipynb @@ -51,7 +51,9 @@ "1. [Getting started with FlowClient](01-getting-started-with-flowclient.ipynb) - this tutorial will teach you how to connect to a Flowkit server, \n", " 1a. 
[Getting an access token](01a-getting-an-access-token.ipynb) - instructions for getting an access token from FlowAuth, which you will use in tutorial 1, \n", "2. [Running a query](02-running-a-query.ipynb) - this tutorial will teach you how to use FlowClient to get aggregated data from FlowKit, \n", - "3. [Geography](03-geography.ipynb) - this tutorial will teach you how to join query results to geographic boundaries. " + "3. [Geography](03-geography.ipynb) - this tutorial will teach you how to join query results to geographic boundaries. \n", + "\n", + "Once you have worked through these, you could take a look at the [\"Mobility indicators\" worked example](mobility-indicators.ipynb) for a demonstration of some analysis using FlowKit aggregates." ] } ],