fix: fix how files are downloaded
Robso-creator committed Nov 22, 2024
1 parent aa28a42 commit acd5a2f
Showing 4 changed files with 56 additions and 228 deletions.
73 changes: 46 additions & 27 deletions src/io/get_files_dict.py
@@ -3,10 +3,11 @@
 
 import requests
 from bs4 import BeautifulSoup
-from requests.adapters import HTTPAdapter, Retry
 
-from src import SRC_PATH, DATA_FOLDER
-from src.io import CORE_URL_FILES, HEADERS
+from src import DATA_FOLDER
+from src import SRC_PATH
+from src.io import CORE_URL_FILES
+from src.io import HEADERS
 from src.io.get_last_ref_date import main as get_last_ref_date
 

@@ -19,18 +20,22 @@ def main():
     ref_date = get_last_ref_date()
 
     # get page content
-    page = requests.get(CORE_URL_FILES, headers=HEADERS)
+    _folder_open_date = 'dados_abertos_cnpj'
+    CORE_URL = f'{CORE_URL_FILES}/{_folder_open_date}/{ref_date}'
+    page = requests.get(CORE_URL, headers=HEADERS)
 
     # BeautifulSoup object
     soup = BeautifulSoup(page.text, 'html.parser')
 
     table = soup.find('table')
     rows = table.find_all('tr')
-    dict_files_url = {'SOCIOS': {},
-                      'EMPRESAS': {},
-                      'ESTABELECIMENTOS': {},
-                      'TAX_REGIME': {},
-                      'TABELAS': {}}
+    dict_files_url = {
+        'SOCIOS': {},
+        'EMPRESAS': {},
+        'ESTABELECIMENTOS': {},
+        'TAX_REGIME': {},
+        'TABELAS': {},
+    }
 
     print('creating dict files url')
     for row in rows:
@@ -40,21 +45,25 @@
                 file_name = row.find_all('td')[1].find('a')['href']
                 # get last modified time and parse to date (ex: '2021-07-19')
                 last_modified = datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime(
-                    '%Y-%m-%d')
+                    '%Y-%m-%d',
+                )
                 # get size file_name
                 file_size = row.find_all('td')[3].text.strip()
                 if 'K' in file_size:
-                    file_size_bytes = float(file_size.replace('K', '')) * 2 ** 10
+                    file_size_bytes = float(file_size.replace('K', '')) * 2**10
                 elif 'M' in file_size:
-                    file_size_bytes = float(file_size.replace('M', '')) * 2 ** 20
+                    file_size_bytes = float(file_size.replace('M', '')) * 2**20
                 else:
                     file_size_bytes = 0
 
-                dict_core = {file_name: {'last_modified': last_modified,
-                                         'file_size_bytes': file_size_bytes,
-                                         'link_to_download': f"{CORE_URL_FILES}/{file_name}",
-                                         'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name)}
-                             }
+                dict_core = {
+                    file_name: {
+                        'last_modified': last_modified,
+                        'file_size_bytes': file_size_bytes,
+                        'link_to_download': f"{CORE_URL}/{file_name}",
+                        'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name),
+                    },
+                }
                 if 'Socios' in file_name:
                     dict_files_url['SOCIOS'].update(dict_core)
                 elif 'Empresas' in file_name:
@@ -79,21 +88,31 @@ def main():
                 file_name = row.find_all('td')[1].find('a')['href']
                 # get last modified time and parse to date (ex: '2021-07-19')
                 last_modified = datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime(
-                    '%Y-%m-%d')
+                    '%Y-%m-%d',
+                )
                 # get size file_name
                 file_size = row.find_all('td')[3].text.strip()
                 if 'K' in file_size:
-                    file_size_bytes = float(file_size.replace('K', '')) * 2 ** 10
+                    file_size_bytes = float(file_size.replace('K', '')) * 2**10
                 elif 'M' in file_size:
-                    file_size_bytes = float(file_size.replace('M', '')) * 2 ** 20
+                    file_size_bytes = float(file_size.replace('M', '')) * 2**20
                 else:
                     file_size_bytes = 0
-                dict_files_url['TAX_REGIME'].update({file_name: {'last_modified': last_modified,
-                                                                 'file_size_bytes': file_size_bytes,
-                                                                 'link_to_download': f"{CORE_URL_FILES}/{_folder_tax_regime}/{file_name}",
-                                                                 'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER,
-                                                                                                ref_date, file_name)}
-                                                     })
+                dict_files_url['TAX_REGIME'].update(
+                    {
+                        file_name: {
+                            'last_modified': last_modified,
+                            'file_size_bytes': file_size_bytes,
+                            'link_to_download': f"{CORE_URL_FILES}/{_folder_tax_regime}/{file_name}",
+                            'path_save_file': os.path.join(
+                                SRC_PATH,
+                                DATA_FOLDER,
+                                ref_date,
+                                file_name,
+                            ),
+                        },
+                    },
+                )
 
     print('Done')
 
@@ -102,4 +121,4 @@ def main():
 
 if __name__ == '__main__':
     dict_files_url = main()
-    print(dict_files_url)
\ No newline at end of file
+    print(dict_files_url)
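
Note: the substance of this fix is that the listing moved under a dated folder, so both the scraped page and every download link now include `dados_abertos_cnpj/<ref_date>`. A minimal sketch of the before/after link layout follows; the host and file name are hypothetical stand-ins, not the repo's actual `CORE_URL_FILES` value:

```python
# Sketch of the URL change, assuming CORE_URL_FILES is the open-data root.
# The host and file name below are illustrative examples only.
CORE_URL_FILES = 'https://example-open-data.gov.br'  # stand-in for src.io.CORE_URL_FILES
ref_date = '2024-11-09'  # as returned by get_last_ref_date()

# Before: links were built straight off the root listing.
old_link = f'{CORE_URL_FILES}/Empresas0.zip'

# After: the listing and the files live under dados_abertos_cnpj/<ref_date>/.
CORE_URL = f'{CORE_URL_FILES}/dados_abertos_cnpj/{ref_date}'
new_link = f'{CORE_URL}/Empresas0.zip'

print(old_link)  # https://example-open-data.gov.br/Empresas0.zip
print(new_link)  # https://example-open-data.gov.br/dados_abertos_cnpj/2024-11-09/Empresas0.zip
```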
26 changes: 10 additions & 16 deletions src/io/get_last_ref_date.py
@@ -1,11 +1,8 @@
-from collections import Counter
-from datetime import datetime
-
 import requests
 from bs4 import BeautifulSoup
-from requests.adapters import HTTPAdapter, Retry
 
-from src.io import CORE_URL_FILES, HEADERS
+from src.io import CORE_URL_FILES
+from src.io import HEADERS
 
 
 def main():
@@ -14,30 +11,27 @@ def main():
     :return: dict with urls from files as well as last modified date and size in bytes
     """
     # get page content
-    page = requests.get(CORE_URL_FILES, headers=HEADERS)
+    _folder_open_date = 'dados_abertos_cnpj'
+    page = requests.get(f'{CORE_URL_FILES}/{_folder_open_date}', headers=HEADERS)
 
     # BeautifulSoup object
     soup = BeautifulSoup(page.text, 'html.parser')
 
     table = soup.find('table')
     rows = table.find_all('tr')
     list_last_modified_at = []
 
     print('creating dict files url')
     for row in rows:
         if row.find_all('td'):
-            if row.find_all('td')[1].find('a')['href'].endswith('.zip'):
-                # get last modified time and parse to date (ex: '2021-07-19')
-                list_last_modified_at.append(
-                    datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime(
-                        '%Y-%m-%d'))
+            if row.find_all('td')[1].find('a')['href'].replace('-', '').replace('/', '').isdigit():
+                list_last_modified_at.append(row.find_all('td')[1].find('a')['href'].replace('/', ''))
 
-    # get the most common on 'last_modified' from source
-    ref_date, occurences = Counter(list_last_modified_at).most_common(1)[0]
-    print(
-        f"ref date will be: '{ref_date}' with {occurences} out of {len(list_last_modified_at)} ({occurences / len(list_last_modified_at):.1%}) ")
+    ref_date = max(list_last_modified_at)
+    print('last updated date is ', ref_date)
 
     return ref_date
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
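
The ref-date logic changes accordingly: instead of taking the most common last-modified timestamp of the `.zip` rows, the function now reads the dated folder names in the listing and picks the newest one. A small sketch of that selection, using hypothetical hrefs in place of the scraped table rows:

```python
# Hypothetical directory-listing hrefs standing in for the scraped rows.
hrefs = ['../', '2024-09-14/', '2024-10-05/', '2024-11-09/', 'regime_tributario/']

list_last_modified_at = []
for href in hrefs:
    # A date folder like '2024-11-09/' becomes purely numeric once '-' and '/'
    # are stripped, which is exactly the isdigit() filter used in the diff.
    if href.replace('-', '').replace('/', '').isdigit():
        list_last_modified_at.append(href.replace('/', ''))

# ISO-formatted dates sort lexicographically, so max() picks the newest folder.
ref_date = max(list_last_modified_at)
print(ref_date)  # -> '2024-11-09'
```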
143 changes: 0 additions & 143 deletions tests/io/test_get_files_list.py

This file was deleted.

42 changes: 0 additions & 42 deletions tests/io/test_get_last_ref_date.py

This file was deleted.
