From d757eda05767b0e2fb5af6e14c5e94eb35259f5d Mon Sep 17 00:00:00 2001
From: Sergio Delgado Quintero
Date: Thu, 20 May 2021 08:29:06 +0100
Subject: [PATCH 01/11] Sort imports

---
 main.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index 4b695e6..39d3b56 100644
--- a/main.py
+++ b/main.py
@@ -1,9 +1,10 @@
+from collections import defaultdict
+
+import notification
 import scraping
 import settings
 import storage
 import wrangling
-import notification
-from collections import defaultdict
 
 print('Downloading codelist...')
 storage.download_codelist(settings.CODELIST_FILENAME)

From e7f36cfd457604049fe86eff0a5a7907056260a4 Mon Sep 17 00:00:00 2001
From: Sergio Delgado Quintero
Date: Thu, 20 May 2021 11:23:22 +0100
Subject: [PATCH 02/11] Change data reference filename

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 850cbff..4fc5416 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,7 @@
 .ipynb_checkpoints
 sandbox.ipynb
 .env
-codelist.xlsx
+dataref.xlsx
 gdrive-credentials.json
 gdrive-secrets.json
 gcs-credentials.json

From 0007ef4f83295052053e4bd7a7b879facdd98ba5 Mon Sep 17 00:00:00 2001
From: Sergio Delgado Quintero
Date: Thu, 20 May 2021 11:23:49 +0100
Subject: [PATCH 03/11] Fix settings for data reference

---
 settings.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/settings.py b/settings.py
index 674db0f..25da9c0 100644
--- a/settings.py
+++ b/settings.py
@@ -18,9 +18,13 @@
 )
 RECODING_LANGUAGES = config('RECODING_LANGUAGES', default='ES,PT', cast=config.list)
 
-CODELIST_FILENAME = config('CODELIST_FILENAME', default='codelist.xlsx')
-GDRIVE_CODELIST_ID = config('GDRIVE_CODELIST_ID')
-CODELIST_RECODE_SHEET = config('CODELIST_RECODE_SHEET')
+DATAREF_FILENAME = config('DATAREF_FILENAME', default='dataref.xlsx')
+GDRIVE_DATAREF_ID = config('GDRIVE_DATAREF_ID')
+DATAREF_CODELISTS_SHEET = config('DATAREF_CODELISTS_SHEET', default='Codelists')
+DATAREF_INVENTORY_SHEET = config('DATAREF_INVENTORY_SHEET', default='Inventario Siemac')
+DATAREF_INVENTORY_DATASET_URLS_COLUMN_NAME = config(
+    'DATAREF_INVENTORY_DATASET_URLS_COLUMN_NAME', default='URL Dataset'
+)
 
 GDRIVE_API_CREDENTIALS = config('GDRIVE_API_CREDENTIALS', default='gdrive-credentials.json')
 GDRIVE_API_SECRETS = config('GDRIVE_API_SECRETS', default='gdrive-secrets.json')

From c065006ca8cbbc6cf48f65b03b1f025d2d52919c Mon Sep 17 00:00:00 2001
From: Sergio Delgado Quintero
Date: Thu, 20 May 2021 11:24:40 +0100
Subject: [PATCH 04/11] Get codelists and datasets inventory while downloading

---
 storage.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/storage.py b/storage.py
index 1fb95fe..a860404 100644
--- a/storage.py
+++ b/storage.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 
+import pandas as pd
 from google.cloud import storage
 from yagdrive import GDrive
 
@@ -13,8 +14,18 @@
 bucket = gcs.get_bucket('siemac')
 
 
-def download_codelist(filepath: str):
-    drive.get_by_id(settings.GDRIVE_CODELIST_ID, filepath)
+def download_dataref(
+    file_id=settings.GDRIVE_DATAREF_ID,
+    filepath=settings.DATAREF_FILENAME,
+    codelists_sheet=settings.DATAREF_CODELISTS_SHEET,
+    inventory_sheet=settings.DATAREF_INVENTORY_SHEET,
+    inventory_dataset_urls_column_name=settings.DATAREF_INVENTORY_DATASET_URLS_COLUMN_NAME,
+):
+    drive.get_by_id(file_id, filepath)
+    fh = Path(filepath)
+    codelists = pd.read_excel(fh, sheet_name=codelists_sheet)
+    inventory = pd.read_excel(fh, sheet_name=inventory_sheet)
+    return codelists, inventory[inventory_dataset_urls_column_name]
 
 
 def upload(file: Path):
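With PATCH 04 applied, fetching and unpacking the data reference takes a single call. A minimal usage sketch, for illustration only, assuming the defaults above (dataref.xlsx with the 'Codelists' and 'Inventario Siemac' sheets) and valid Google Drive credentials:

    import storage

    # Downloads dataref.xlsx from Google Drive, then returns a tuple:
    # (codelists as a pandas DataFrame, dataset URLs as a pandas Series
    # taken from the 'URL Dataset' column of the inventory sheet).
    codelists, datasets_urls = storage.download_dataref()
    print(codelists.head())
    for url in datasets_urls:
        print(url)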
""" - # Load codelist - cl = pd.read_excel(codelist, sheet_name=settings.CODELIST_RECODE_SHEET) df = pd.read_csv(dataset, sep='\t') df = _filter_dataset(df, geocodes) @@ -55,7 +62,7 @@ def stage_dataset( output_files = [] for lang in languages: - recoded_df = _recode_dataset(df, cl, lang) + recoded_df = _recode_dataset(df, codelists[lang]) output_stem = f'{dataset.stem}_{lang.lower()}' output_file = dataset.with_name(output_stem + '.tsv') From b9025e05012d78d0c8c4936d2d4f95e1f26afd1a Mon Sep 17 00:00:00 2001 From: Sergio Delgado Quintero Date: Thu, 20 May 2021 12:01:22 +0100 Subject: [PATCH 06/11] Discard datasets with no records after filtering --- main.py | 17 ++++++++--------- wrangling.py | 3 +++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 3e22777..8adcca2 100644 --- a/main.py +++ b/main.py @@ -18,15 +18,14 @@ dataset = scraping.download_dataset(dataset_url) print(f'Staging {dataset}...') - output_files = wrangling.stage_dataset(dataset, codelists) - - print('Uploading output files...') - for file in output_files: - download_url = storage.upload(file) - filename = file.name - uploaded_files[dataset.stem].append((filename, download_url)) - print(f"{filename} -> {download_url}") - print() + if output_files := wrangling.stage_dataset(dataset, codelists): + print('Uploading output files...') + for file in output_files: + download_url = storage.upload(file) + filename = file.name + uploaded_files[dataset.stem].append((filename, download_url)) + print(f"{filename} -> {download_url}") + print() print('Notifying results...') notification.notify(uploaded_files) diff --git a/wrangling.py b/wrangling.py index fda9f0b..3e291ce 100644 --- a/wrangling.py +++ b/wrangling.py @@ -57,6 +57,9 @@ def stage_dataset( df = pd.read_csv(dataset, sep='\t') df = _filter_dataset(df, geocodes) + if df.size == 0: + print('Dataset has no records with supplied geocodes. 
Discarding...') + return False df = _clean_dataset(df) output_files = [] From c72991eb558621ec850c292f2002880a136463fb Mon Sep 17 00:00:00 2001 From: Sergio Delgado Quintero Date: Thu, 20 May 2021 12:07:21 +0100 Subject: [PATCH 07/11] Get rid of unused code --- requirements.txt | 1 - scraping.py | 27 --------------------------- settings.py | 10 ---------- 3 files changed, 38 deletions(-) diff --git a/requirements.txt b/requirements.txt index e06e6e0..2903961 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -selenium pandas prettyconf yagdrive diff --git a/scraping.py b/scraping.py index 82839b7..ee6b126 100644 --- a/scraping.py +++ b/scraping.py @@ -1,39 +1,12 @@ import gzip -import os import re from pathlib import Path import requests -from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.firefox.options import Options -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait import settings -def get_datasets_urls(target_url): - options = Options() - options.headless = True - - driver = webdriver.Firefox(options=options) - driver.get(target_url) - - element = WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.CSS_SELECTOR, '#gepRegionIndicators .tipTable')) - ) - - items = element.find_elements_by_class_name('tipRow') - for item in items: - item_id = item.get_attribute('id') - dataset_code = re.match(r'tipRow_[\d-]+_(.*)', item_id).group(1) - dataset_table_url = os.path.join( - settings.BASE_DATASET_URL, dataset_code + '.tsv.gz' - ) - yield dataset_table_url - - def download_dataset(dataset_url, target_folder=settings.DATASETS_DIR): target_folder = Path(target_folder) target_folder.mkdir(parents=True, exist_ok=True) diff --git a/settings.py b/settings.py index 25da9c0..8d81ccf 100644 --- a/settings.py +++ b/settings.py @@ -2,16 +2,6 @@ from prettyconf import config -TARGET_URL = config( - 'TARGET_URL', - default='https://ec.europa.eu/eurostat/cache/RCI/myregion/' - '#?reg=ES70&ind=1-2_demo_r_d2jan', -) -BASE_DATASET_URL = config( - 'BASE_DATASET_URL', - default='https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/' - 'BulkDownloadListing?file=data/', -) DATASETS_DIR = config('DATASETS_DIR', default='data') TARGET_GEOCODES = config( 'TARGET_GEOCODES', default='ES70,PT20,PT30,EU27_2020', cast=config.list From a05adec80b23b7ffc5f14be13a471b052ca8948e Mon Sep 17 00:00:00 2001 From: Sergio Delgado Quintero Date: Thu, 20 May 2021 12:49:08 +0100 Subject: [PATCH 08/11] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 643a6c0..36325ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ Versions follow [Semantic Versioning](https://semver.org/) (`.. Date: Fri, 21 May 2021 08:34:51 +0100 Subject: [PATCH 09/11] Filter datasets with progressive geocode groups --- CHANGELOG.md | 1 + settings.py | 12 +++++++++++- wrangling.py | 22 ++++++++++++++-------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36325ec..736cd4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Released XXXX-XX-XX - Fix CI GitHub workflow. - Include new indicators. - Direct scraping from datasets urls on dataref. +- Filter datasets with aumented NUTs. 
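To make the geocode grouping in PATCH 09 concrete, here is a small sketch (illustration only, not part of the patch) of what geocodes_cast produces and how _filter_dataset uses it:

    # Same parsing logic as geocodes_cast() in settings.py.
    value = 'ES7|ES70,PT2|PT20,PT3|PT30,EU27_2020'
    groups = [v.split('|') for v in value.split(',')]
    print(groups)
    # [['ES7', 'ES70'], ['PT2', 'PT20'], ['PT3', 'PT30'], ['EU27_2020']]

    # For each group, _filter_dataset() keeps the rows matched by whichever
    # geocode in that group hits the most rows in the dataset's first column,
    # so a dataset published at NUTS-1 level (ES7) or NUTS-2 level (ES70)
    # is picked up either way.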
From d314f5566d1ce8465508ffacde4537025f5e85e8 Mon Sep 17 00:00:00 2001
From: Sergio Delgado Quintero
Date: Fri, 21 May 2021 11:57:56 +0100
Subject: [PATCH 10/11] Fix loading of codelists

---
 storage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/storage.py b/storage.py
index a860404..2a4aa7d 100644
--- a/storage.py
+++ b/storage.py
@@ -23,7 +23,7 @@ def download_dataref(
 ):
     drive.get_by_id(file_id, filepath)
     fh = Path(filepath)
-    codelists = pd.read_excel(fh, sheet_name=codelists_sheet)
+    codelists = pd.read_excel(fh, sheet_name=codelists_sheet, dtype=str)
     inventory = pd.read_excel(fh, sheet_name=inventory_sheet)
     return codelists, inventory[inventory_dataset_urls_column_name]

From df69c68460614afa399599c4e57ddda5cb8b81e3 Mon Sep 17 00:00:00 2001
From: Sergio Delgado Quintero
Date: Mon, 24 May 2021 09:01:10 +0100
Subject: [PATCH 11/11] Fix workflow on README

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 24d10b0..8707a04 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,8 @@
 
 ![siemactk-workflow](img/siemactk-workflow.png)
 
-1. Scrap some datasets from https://ec.europa.eu/eurostat.
-2. Get codelist from Google Spreadsheets.
+1. Get dataset URLs and codelists from Google Spreadsheets.
+2. Scrape these datasets from https://ec.europa.eu/eurostat.
 3. Filter, clean & recode datasets.
 4. Generate translated output files in both `.json` and `.tsv` formats.
 5. Upload output files to a bucket in Google Cloud Storage (GCS).
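After this series, the whole pipeline in main.py boils down to the following sketch (it mirrors the code in the patches above and assumes GDRIVE_DATAREF_ID plus the Google Drive/GCS credential files are configured):

    import scraping
    import storage
    import wrangling

    # Step 1: dataset URLs and codelists come from the dataref spreadsheet.
    codelists, datasets_urls = storage.download_dataref()
    codelists = wrangling.codelists_to_dict(codelists)

    for dataset_url in datasets_urls:
        # Step 2: scrape each dataset from Eurostat.
        dataset = scraping.download_dataset(dataset_url)
        # Steps 3-4: filter, clean and recode; empty datasets are discarded.
        if output_files := wrangling.stage_dataset(dataset, codelists):
            # Step 5: upload the generated .tsv/.json files to GCS.
            for file in output_files:
                storage.upload(file)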