diff --git a/src/io/get_files_dict.py b/src/io/get_files_dict.py index 63a38f0..747181b 100644 --- a/src/io/get_files_dict.py +++ b/src/io/get_files_dict.py @@ -3,10 +3,11 @@ import requests from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter, Retry -from src import SRC_PATH, DATA_FOLDER -from src.io import CORE_URL_FILES, HEADERS +from src import DATA_FOLDER +from src import SRC_PATH +from src.io import CORE_URL_FILES +from src.io import HEADERS from src.io.get_last_ref_date import main as get_last_ref_date @@ -19,18 +20,22 @@ def main(): ref_date = get_last_ref_date() # get page content - page = requests.get(CORE_URL_FILES, headers=HEADERS) + _folder_open_date = 'dados_abertos_cnpj' + CORE_URL = f'{CORE_URL_FILES}/{_folder_open_date}/{ref_date}' + page = requests.get(CORE_URL, headers=HEADERS) # BeautifulSoup object soup = BeautifulSoup(page.text, 'html.parser') table = soup.find('table') rows = table.find_all('tr') - dict_files_url = {'SOCIOS': {}, - 'EMPRESAS': {}, - 'ESTABELECIMENTOS': {}, - 'TAX_REGIME': {}, - 'TABELAS': {}} + dict_files_url = { + 'SOCIOS': {}, + 'EMPRESAS': {}, + 'ESTABELECIMENTOS': {}, + 'TAX_REGIME': {}, + 'TABELAS': {}, + } print('creating dict files url') for row in rows: @@ -40,21 +45,25 @@ def main(): file_name = row.find_all('td')[1].find('a')['href'] # get last modified time and parse to date (ex: '2021-07-19') last_modified = datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime( - '%Y-%m-%d') + '%Y-%m-%d', + ) # get size file_name file_size = row.find_all('td')[3].text.strip() if 'K' in file_size: - file_size_bytes = float(file_size.replace('K', '')) * 2 ** 10 + file_size_bytes = float(file_size.replace('K', '')) * 2**10 elif 'M' in file_size: - file_size_bytes = float(file_size.replace('M', '')) * 2 ** 20 + file_size_bytes = float(file_size.replace('M', '')) * 2**20 else: file_size_bytes = 0 - dict_core = {file_name: {'last_modified': last_modified, - 'file_size_bytes': file_size_bytes, - 'link_to_download': f"{CORE_URL_FILES}/{file_name}", - 'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name)} - } + dict_core = { + file_name: { + 'last_modified': last_modified, + 'file_size_bytes': file_size_bytes, + 'link_to_download': f"{CORE_URL}/{file_name}", + 'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name), + }, + } if 'Socios' in file_name: dict_files_url['SOCIOS'].update(dict_core) elif 'Empresas' in file_name: @@ -79,21 +88,31 @@ def main(): file_name = row.find_all('td')[1].find('a')['href'] # get last modified time and parse to date (ex: '2021-07-19') last_modified = datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime( - '%Y-%m-%d') + '%Y-%m-%d', + ) # get size file_name file_size = row.find_all('td')[3].text.strip() if 'K' in file_size: - file_size_bytes = float(file_size.replace('K', '')) * 2 ** 10 + file_size_bytes = float(file_size.replace('K', '')) * 2**10 elif 'M' in file_size: - file_size_bytes = float(file_size.replace('M', '')) * 2 ** 20 + file_size_bytes = float(file_size.replace('M', '')) * 2**20 else: file_size_bytes = 0 - dict_files_url['TAX_REGIME'].update({file_name: {'last_modified': last_modified, - 'file_size_bytes': file_size_bytes, - 'link_to_download': f"{CORE_URL_FILES}/{_folder_tax_regime}/{file_name}", - 'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, - ref_date, file_name)} - }) + dict_files_url['TAX_REGIME'].update( + { + file_name: { + 'last_modified': last_modified, + 'file_size_bytes': file_size_bytes, + 'link_to_download': f"{CORE_URL_FILES}/{_folder_tax_regime}/{file_name}", + 'path_save_file': os.path.join( + SRC_PATH, + DATA_FOLDER, + ref_date, + file_name, + ), + }, + }, + ) print('Done') @@ -102,4 +121,4 @@ def main(): if __name__ == '__main__': dict_files_url = main() - print(dict_files_url) + print(dict_files_url) \ No newline at end of file diff --git a/src/io/get_last_ref_date.py b/src/io/get_last_ref_date.py index 4c4409f..b5ae95c 100644 --- a/src/io/get_last_ref_date.py +++ b/src/io/get_last_ref_date.py @@ -1,11 +1,8 @@ -from collections import Counter -from datetime import datetime - import requests from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter, Retry -from src.io import CORE_URL_FILES, HEADERS +from src.io import CORE_URL_FILES +from src.io import HEADERS def main(): @@ -14,11 +11,11 @@ def main(): :return: dict with urls from files as well as last modified date and size in bytes """ # get page content - page = requests.get(CORE_URL_FILES, headers=HEADERS) + _folder_open_date = 'dados_abertos_cnpj' + page = requests.get(f'{CORE_URL_FILES}/{_folder_open_date}', headers=HEADERS) # BeautifulSoup object soup = BeautifulSoup(page.text, 'html.parser') - table = soup.find('table') rows = table.find_all('tr') list_last_modified_at = [] @@ -26,18 +23,15 @@ def main(): print('creating dict files url') for row in rows: if row.find_all('td'): - if row.find_all('td')[1].find('a')['href'].endswith('.zip'): + if row.find_all('td')[1].find('a')['href'].replace('-', '').replace('/', '').isdigit(): # get last modified time and parse to date (ex: '2021-07-19') - list_last_modified_at.append( - datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime( - '%Y-%m-%d')) - + list_last_modified_at.append(row.find_all('td')[1].find('a')['href'].replace('/', '')) # get the most common on 'last_modified' from source - ref_date, occurences = Counter(list_last_modified_at).most_common(1)[0] - print( - f"ref date will be: '{ref_date}' with {occurences} out of {len(list_last_modified_at)} ({occurences / len(list_last_modified_at):.1%}) ") + ref_date = max(list_last_modified_at) + print('last updated date is ', ref_date) + return ref_date if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/tests/io/test_get_files_list.py b/tests/io/test_get_files_list.py deleted file mode 100644 index 80f01a3..0000000 --- a/tests/io/test_get_files_list.py +++ /dev/null @@ -1,143 +0,0 @@ -from unittest.mock import Mock -import os - -from src.io.get_files_dict import main as get_files_dict - -DIR_NAME = os.path.dirname(os.path.abspath(__file__)) - - -class ObjectFakeText: - def __init__(self, txt): - self.txt = txt - - @property - def text(self): - return self.txt - - -def test_get_files_dict_keys(fixture_get_files_dict): - dict_files = fixture_get_files_dict - keys = ['SOCIOS', 'EMPRESAS', 'ESTABELECIMENTOS', 'TABELAS', 'TAX_REGIME', 'folder_ref_date_save_zip'] - - assert sorted(dict_files.keys()) == sorted(keys) - - -def test_get_files_dict_keys_sub_dicts(fixture_get_files_dict): - dict_files = fixture_get_files_dict - tbls = ['SOCIOS', 'EMPRESAS', 'ESTABELECIMENTOS', 'TABELAS', 'TAX_REGIME'] - - for tbl in tbls: - _dict = dict_files[tbl] - for file, dict_file in _dict.items(): - assert sorted(dict_file.keys()) == sorted( - ['last_modified', 'file_size_bytes', 'link_to_download', 'path_save_file']) - - -def test_get_files_dict_empresas(fixture_get_files_dict): - dict_files = fixture_get_files_dict - dict_files_target = dict_files['EMPRESAS'] - - assert len(dict_files_target.keys()) == 10 - - -def test_get_files_dict_estabelecimentos(fixture_get_files_dict): - dict_files = fixture_get_files_dict - dict_files_target = dict_files['ESTABELECIMENTOS'] - - assert len(dict_files_target.keys()) == 10 - - -def test_get_files_dict_socios(fixture_get_files_dict): - dict_files = fixture_get_files_dict - dict_files_target = dict_files['SOCIOS'] - - assert len(dict_files_target.keys()) == 10 - - -def test_get_files_dict_tabelas(fixture_get_files_dict): - dict_files = fixture_get_files_dict - dict_files_target = dict_files['TABELAS'] - - assert len(dict_files_target.keys()) == 7 - - -def test_get_files_dict_tax_regime(fixture_get_files_dict): - dict_files = fixture_get_files_dict - dict_files_target = dict_files['TAX_REGIME'] - - assert len(dict_files_target.keys()) == 4 - - -def test_get_last_ref_date_mock_empresas(mocker): - mock_requests = Mock() - mocker.patch('src.io.get_last_ref_date.requests.get', mock_requests) - html_file = 'test_get_last_ref_date_all_equal.html' - html_path = os.path.join(DIR_NAME, 'htmls', html_file) - mock_requests.return_value = ObjectFakeText(open(html_path).read()) - - dict_files = get_files_dict() - dict_files_target = dict_files['EMPRESAS'] - - assert len(dict_files_target.keys()) == 10 - - list_expected_files = [f'Empresas{r}.zip' for r in range(10)] - - assert sorted(dict_files_target.keys()) == sorted(list_expected_files) - - -def test_get_last_ref_date_mock_estabelecimentos(mocker): - mock_requests = Mock() - mocker.patch('src.io.get_last_ref_date.requests.get', mock_requests) - html_file = 'test_get_last_ref_date_all_equal.html' - html_path = os.path.join(DIR_NAME, 'htmls', html_file) - mock_requests.return_value = ObjectFakeText(open(html_path).read()) - - dict_files = get_files_dict() - dict_files_target = dict_files['ESTABELECIMENTOS'] - - assert len(dict_files_target.keys()) == 10 - - list_expected_files = [f'Estabelecimentos{r}.zip' for r in range(10)] - - assert sorted(dict_files_target.keys()) == sorted(list_expected_files) - - -def test_get_last_ref_date_mock_socios(mocker): - mock_requests = Mock() - mocker.patch('src.io.get_last_ref_date.requests.get', mock_requests) - html_file = 'test_get_last_ref_date_all_equal.html' - html_path = os.path.join(DIR_NAME, 'htmls', html_file) - mock_requests.return_value = ObjectFakeText(open(html_path).read()) - - dict_files = get_files_dict() - dict_files_target = dict_files['SOCIOS'] - - assert len(dict_files_target.keys()) == 10 - - list_expected_files = [f'Socios{r}.zip' for r in range(10)] - - assert sorted(dict_files_target.keys()) == sorted(list_expected_files) - - -def test_get_last_ref_date_mock_tabelas(mocker): - mock_requests = Mock() - mocker.patch('src.io.get_last_ref_date.requests.get', mock_requests) - html_file = 'test_get_last_ref_date_all_equal.html' - html_path = os.path.join(DIR_NAME, 'htmls', html_file) - mock_requests.return_value = ObjectFakeText(open(html_path).read()) - - dict_files = get_files_dict() - dict_files_target = dict_files['TABELAS'] - - assert len(dict_files_target.keys()) == 7 - - list_expected_files = ['Simples.zip', - 'Cnaes.zip', - 'Motivos.zip', - 'Municipios.zip', - 'Naturezas.zip', - 'Paises.zip', - 'Qualificacoes.zip' - ] - - assert sorted(dict_files_target.keys()) == sorted(list_expected_files) diff --git a/tests/io/test_get_last_ref_date.py b/tests/io/test_get_last_ref_date.py deleted file mode 100644 index 707e1aa..0000000 --- a/tests/io/test_get_last_ref_date.py +++ /dev/null @@ -1,42 +0,0 @@ -import os -from unittest.mock import Mock - -from src.io.get_last_ref_date import main as get_last_ref_date - -core_url_expected = "https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/cadastros/consultas/dados-publicos-cnpj" -DIR_NAME = os.path.dirname(os.path.abspath(__file__)) - - -class ObjectFakeText: - def __init__(self, txt): - self.txt = txt - - @property - def text(self): - return self.txt - - -def test_get_last_ref_date_all_one_date(mocker): - mock_requests = Mock() - mocker.patch('src.io.get_last_ref_date.requests', mock_requests) - html_file = 'test_get_last_ref_date_all_equal.html' - html_path = os.path.join(DIR_NAME, 'htmls', html_file) - mock_requests.get.return_value = ObjectFakeText(open(html_path)) - - ref_date = get_last_ref_date() - ref_date_expected = '2022-08-15' - - assert ref_date == ref_date_expected - - -def test_get_last_ref_date_diff_dates(mocker): - mock_requests = Mock() - mocker.patch('src.io.get_last_ref_date.requests', mock_requests) - html_file = 'test_get_last_ref_date_diffs.html' - html_path = os.path.join(DIR_NAME, 'htmls', html_file) - mock_requests.get.return_value = ObjectFakeText(open(html_path)) - - ref_date = get_last_ref_date() - ref_date_expected = '2020-01-01' - - assert ref_date == ref_date_expected