Merge pull request #1 from eDatos/new-indicators
New indicators
sdelquin authored May 24, 2021
2 parents fbc025a + df69c68 commit d22c472
Showing 9 changed files with 84 additions and 76 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -6,7 +6,7 @@
.ipynb_checkpoints
sandbox.ipynb
.env
codelist.xlsx
dataref.xlsx
gdrive-credentials.json
gdrive-secrets.json
gcs-credentials.json
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,9 @@ Versions follow [Semantic Versioning](https://semver.org/) (`<major>.<minor>.<patch>`).
Released XXXX-XX-XX

- Fix CI GitHub workflow.
- Include new indicators.
- Direct scraping from dataset URLs on dataref.
- Filter datasets with augmented NUTS.

## Version 0.6.1

4 changes: 2 additions & 2 deletions README.md
@@ -6,8 +6,8 @@

![siemactk-workflow](img/siemactk-workflow.png)

1. Scrap some datasets from https://ec.europa.eu/eurostat.
2. Get codelist from Google Spreadsheets.
1. Get dataset URLs and codelists from Google Spreadsheets.
2. Scrape these datasets from https://ec.europa.eu/eurostat.
3. Filter, clean & recode datasets.
4. Generate translated output files in both `.json` and `.tsv` formats.
5. Upload output files to a bucket in Google Cloud Storage (GCS).
32 changes: 17 additions & 15 deletions main.py
@@ -1,29 +1,31 @@
from collections import defaultdict

import notification
import scraping
import settings
import storage
import wrangling
import notification
from collections import defaultdict

print('Downloading codelist...')
storage.download_codelist(settings.CODELIST_FILENAME)
print('Downloading dataref file...')
codelists, datasets_urls = storage.download_dataref()

print('Converting codelists to dict mapping...')
codelists = wrangling.codelists_to_dict(codelists)

uploaded_files = defaultdict(list)

for dataset_url in scraping.get_datasets_urls(settings.TARGET_URL):
for dataset_url in datasets_urls:
print(f'Downloading {dataset_url}...')
dataset = scraping.download_dataset(dataset_url)

print(f'Staging {dataset}...')
output_files = wrangling.stage_dataset(dataset)

print('Uploading output files...')
for file in output_files:
download_url = storage.upload(file)
filename = file.name
uploaded_files[dataset.stem].append((filename, download_url))
print(f"{filename} -> {download_url}")
print()
if output_files := wrangling.stage_dataset(dataset, codelists):
print('Uploading output files...')
for file in output_files:
download_url = storage.upload(file)
filename = file.name
uploaded_files[dataset.stem].append((filename, download_url))
print(f"{filename} -> {download_url}")
print()

print('Notifying results...')
notification.notify(uploaded_files)
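
One behavioral change worth noting: `stage_dataset` now returns a falsy value when a dataset has no rows for the target geocodes, and the walrus assignment in the loop skips the upload block for those datasets. A minimal sketch of that pattern, using a hypothetical stand-in for `wrangling.stage_dataset`:

```python
def stage_dataset(dataset, codelists):
    # Hypothetical stand-in: pretend the 'empty' dataset yields no output files.
    return False if dataset == 'empty' else [f'{dataset}_es.tsv', f'{dataset}_es.json']

for dataset in ['demo_r_d2jan', 'empty']:
    if output_files := stage_dataset(dataset, codelists={}):
        print(f'Uploading {output_files}...')  # runs only for 'demo_r_d2jan'
```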
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,4 +1,3 @@
selenium
pandas
prettyconf
yagdrive
27 changes: 0 additions & 27 deletions scraping.py
@@ -1,39 +1,12 @@
import gzip
import os
import re
from pathlib import Path

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import settings


def get_datasets_urls(target_url):
options = Options()
options.headless = True

driver = webdriver.Firefox(options=options)
driver.get(target_url)

element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#gepRegionIndicators .tipTable'))
)

items = element.find_elements_by_class_name('tipRow')
for item in items:
item_id = item.get_attribute('id')
dataset_code = re.match(r'tipRow_[\d-]+_(.*)', item_id).group(1)
dataset_table_url = os.path.join(
settings.BASE_DATASET_URL, dataset_code + '.tsv.gz'
)
yield dataset_table_url


def download_dataset(dataset_url, target_folder=settings.DATASETS_DIR):
target_folder = Path(target_folder)
target_folder.mkdir(parents=True, exist_ok=True)
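
With the Selenium-based `get_datasets_urls` gone, `scraping.py` only needs `requests` (plus `gzip`). The body of `download_dataset` is folded above; here is a plausible sketch of the remaining logic, assuming it fetches the gzipped TSV and stores it decompressed — a guess for illustration, not the repository's actual code:

```python
import gzip
from pathlib import Path

import requests

def download_dataset(dataset_url, target_folder='data'):
    target_folder = Path(target_folder)
    target_folder.mkdir(parents=True, exist_ok=True)
    # Assumed naming: the URL ends in '<code>.tsv.gz'; keep '<code>.tsv' locally.
    dataset = target_folder / Path(dataset_url).name.removesuffix('.gz')
    response = requests.get(dataset_url)
    dataset.write_bytes(gzip.decompress(response.content))
    return dataset
```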
32 changes: 18 additions & 14 deletions settings.py
@@ -2,25 +2,29 @@

from prettyconf import config

TARGET_URL = config(
'TARGET_URL',
default='https://ec.europa.eu/eurostat/cache/RCI/myregion/'
'#?reg=ES70&ind=1-2_demo_r_d2jan',
)
BASE_DATASET_URL = config(
'BASE_DATASET_URL',
default='https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/'
'BulkDownloadListing?file=data/',
)

def geocodes_cast(value):
return [v.split('|') for v in value.split(',')]


DATASETS_DIR = config('DATASETS_DIR', default='data')
# Geocodes will be cast as a list of groups.
# Datasets are filtered by each geocode within its group. For example,
# 'ES7|ES70,PT2|PT20' means 2 groups with 2 geocodes each. The selected rows
# come from whichever geocode matches the most rows: ES7 vs ES70 in the first
# group, PT2 vs PT20 in the second, and so on.
TARGET_GEOCODES = config(
'TARGET_GEOCODES', default='ES70,PT20,PT30,EU27_2020', cast=config.list
'TARGET_GEOCODES', default='ES7|ES70,PT2|PT20,PT3|PT30,EU27_2020', cast=geocodes_cast
)
RECODING_LANGUAGES = config('RECODING_LANGUAGES', default='ES,PT', cast=config.list)

CODELIST_FILENAME = config('CODELIST_FILENAME', default='codelist.xlsx')
GDRIVE_CODELIST_ID = config('GDRIVE_CODELIST_ID')
CODELIST_RECODE_SHEET = config('CODELIST_RECODE_SHEET')
DATAREF_FILENAME = config('DATAREF_FILENAME', default='dataref.xlsx')
GDRIVE_DATAREF_ID = config('GDRIVE_DATAREF_ID')
DATAREF_CODELISTS_SHEET = config('DATAREF_CODELISTS_SHEET', default='Codelists')
DATAREF_INVENTORY_SHEET = config('DATAREF_INVENTORY_SHEET', default='Inventario Siemac')
DATAREF_INVENTORY_DATASET_URLS_COLUMN_NAME = config(
'DATAREF_INVENTORY_DATASET_URLS_COLUMN_NAME', default='URL Dataset'
)

GDRIVE_API_CREDENTIALS = config('GDRIVE_API_CREDENTIALS', default='gdrive-credentials.json')
GDRIVE_API_SECRETS = config('GDRIVE_API_SECRETS', default='gdrive-secrets.json')
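
To make the new casting concrete, here is what `geocodes_cast` produces for the default value (output shown as a comment):

```python
def geocodes_cast(value):
    return [v.split('|') for v in value.split(',')]

print(geocodes_cast('ES7|ES70,PT2|PT20,PT3|PT30,EU27_2020'))
# [['ES7', 'ES70'], ['PT2', 'PT20'], ['PT3', 'PT30'], ['EU27_2020']]
```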
15 changes: 13 additions & 2 deletions storage.py
@@ -1,5 +1,6 @@
from pathlib import Path

import pandas as pd
from google.cloud import storage
from yagdrive import GDrive

@@ -13,8 +14,18 @@
bucket = gcs.get_bucket('siemac')


def download_codelist(filepath: str):
drive.get_by_id(settings.GDRIVE_CODELIST_ID, filepath)
def download_dataref(
file_id=settings.GDRIVE_DATAREF_ID,
filepath=settings.DATAREF_FILENAME,
codelists_sheet=settings.DATAREF_CODELISTS_SHEET,
inventory_sheet=settings.DATAREF_INVENTORY_SHEET,
inventory_dataset_urls_column_name=settings.DATAREF_INVENTORY_DATASET_URLS_COLUMN_NAME,
):
drive.get_by_id(file_id, filepath)
fh = Path(filepath)
codelists = pd.read_excel(fh, sheet_name=codelists_sheet, dtype=str)
inventory = pd.read_excel(fh, sheet_name=inventory_sheet)
return codelists, inventory[inventory_dataset_urls_column_name]


def upload(file: Path):
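
For context, a sketch of what `download_dataref` returns, assuming a dataref workbook shaped like the defaults in settings.py (sheet and column names come from the settings above; the sample rows are invented):

```python
import pandas as pd

# Codelists sheet: one row per (codelist, code) pair, with one column per language.
codelists = pd.DataFrame({
    'cl':   ['geo',      'geo'],
    'code': ['ES70',     'PT20'],
    'ES':   ['Canarias', 'Azores'],
    'PT':   ['Canárias', 'Açores'],
})

# The 'URL Dataset' column of the inventory sheet: one bulk-download URL per dataset.
datasets_urls = pd.Series([
    'https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/'
    'BulkDownloadListing?file=data/demo_r_d2jan.tsv.gz',
])
```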
44 changes: 30 additions & 14 deletions wrangling.py
@@ -1,17 +1,33 @@
from pathlib import Path

import pandas as pd
from pandas.core.frame import DataFrame

import settings


def _filter_dataset(df: DataFrame, geocodes: list):
gc_pattern = ',(?:' + '|'.join(geocodes) + ') *$'
return df[df.iloc[:, 0].str.contains(gc_pattern, regex=True)]


def _clean_dataset(df: DataFrame):
def codelists_to_dict(codelists: pd.DataFrame, languages=settings.RECODING_LANGUAGES):
mapping = {}
for lang in languages:
mapping[lang] = {}
for _, group in codelists.groupby('cl'):
d = group.pivot(index='code', columns='cl', values=lang).to_dict()
mapping[lang].update(d)
return mapping


def _filter_dataset(df: pd.DataFrame, geocodes: list[tuple]):
filtered_df = pd.DataFrame()
for geocodes_group in geocodes:
aux = {}
for geocode in geocodes_group:
filtered_rows = df.iloc[:, 0].str.contains(fr'{geocode}\s*$', regex=True)
aux[sum(filtered_rows)] = df[filtered_rows]
filtered_rows = aux[max(aux)]
filtered_df = filtered_df.append(filtered_rows, ignore_index=True)
return filtered_df


def _clean_dataset(df: pd.DataFrame):
def clean_values(series):
series = series.str.replace(r'[ a-zA-Z:]+$', '', regex=True)
series = series.replace('.', ',')
@@ -27,15 +43,14 @@ def clean_values(series):
return pd.concat([id_df, df], axis=1, verify_integrity=True)


def _recode_dataset(df: DataFrame, cl: DataFrame, language: str):
mapping = cl.pivot(index='code', columns='cl', values=language).to_dict()
def _recode_dataset(df: pd.DataFrame, mapping: dict):
return df.replace(mapping)


def stage_dataset(
dataset: Path,
geocodes: list = settings.TARGET_GEOCODES,
codelist: Path = Path(settings.CODELIST_FILENAME),
codelists: dict,
geocodes: list[tuple] = settings.TARGET_GEOCODES,
languages: list = settings.RECODING_LANGUAGES,
):
"""Stage dataset. Steps:
@@ -45,17 +60,18 @@ def stage_dataset(
4. Recode dataset for the indicated languages.
5. Save dataset as csv (tsv) and json formats.
"""
# Load codelist
cl = pd.read_excel(codelist, sheet_name=settings.CODELIST_RECODE_SHEET)

df = pd.read_csv(dataset, sep='\t')
df = _filter_dataset(df, geocodes)
if df.size == 0:
print('Dataset has no records with supplied geocodes. Discarding...')
return False
df = _clean_dataset(df)

output_files = []

for lang in languages:
recoded_df = _recode_dataset(df, cl, lang)
recoded_df = _recode_dataset(df, codelists[lang])
output_stem = f'{dataset.stem}_{lang.lower()}'

output_file = dataset.with_name(output_stem + '.tsv')
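
Two of the new pieces are easier to see with toy data. `codelists_to_dict` pivots each codelist into a nested mapping keyed by codelist name, which `df.replace()` then applies column-wise; `_filter_dataset` keeps, within each geocode group, whichever geocode matches the most rows — the augmented NUTS case from the changelog. A small sketch with invented sample data:

```python
import pandas as pd

# Toy codelists frame in the dataref layout.
codelists = pd.DataFrame({
    'cl':   ['geo',      'geo'],
    'code': ['ES70',     'PT20'],
    'ES':   ['Canarias', 'Azores'],
    'PT':   ['Canárias', 'Açores'],
})
# codelists_to_dict(codelists) builds, per language, a mapping usable by df.replace():
# {'ES': {'geo': {'ES70': 'Canarias', 'PT20': 'Azores'}},
#  'PT': {'geo': {'ES70': 'Canárias', 'PT20': 'Açores'}}}

# Geocode filtering: within each group, count matching rows per geocode, keep the max.
df = pd.DataFrame({r'unit,geo\time': ['PER,ES70', 'PER,ES70', 'PER,PT20']})
for group in [['ES7', 'ES70'], ['PT2', 'PT20']]:
    counts = {g: df.iloc[:, 0].str.contains(fr'{g}\s*$', regex=True).sum() for g in group}
    print(counts)  # {'ES7': 0, 'ES70': 2} -> ES70 wins; then {'PT2': 0, 'PT20': 1}
```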
