Merge pull request #17 from blasferna/15-extract-data-from-the-new-set-page

Extract data from the new set page
blasferna authored Nov 16, 2023
2 parents 8ae98ab + 2c54929 commit 88a723d
Showing 10 changed files with 89 additions and 143 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -1 +1,2 @@
 venv
+data/tmp/*.*
12 changes: 8 additions & 4 deletions .github/workflows/build.yml
@@ -17,13 +17,17 @@ jobs:
         with:
           python-version: 3.8

-      - name: Install UnRar
-        run: sudo apt-get install unrar
+      - name: Set up Java
+        uses: actions/setup-java@v2
+        with:
+          distribution: 'temurin'
+          java-version: '8'

       - name: Install dependencies
         run: |
           python -m pip install -U pip
           python -m pip install -U requests==2.25.1 beautifulsoup4==4.10.0 pandas==1.3.5 openpyxl==3.0.9
+          python -m pip install -U tabula==1.0.5
       - name: Download files
         run: |
@@ -37,7 +41,7 @@
       - name: Commit changes
         uses: EndBug/add-and-commit@v7
         with:
-          author_name: github-actions
-          author_email: github-actions@github.com
+          author_name: github-actions[bot]
+          author_email: github-actions[bot]@users.noreply.github.com
           message: 'update ruc.zip'
           add: '.'
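Note on this workflow change: the UnRar step goes away because the personas jurídicas list now ships as a PDF rather than a .rar, and a Java 8 runtime comes in, presumably because tabula's read_pdf (added to the install step) delegates PDF table extraction to the Java-based Tabula engine. A minimal local sanity check for that assumption, not part of the commit:

# Sketch: verify the Java runtime that tabula is assumed to need is reachable.
import shutil
import subprocess

assert shutil.which("java") is not None, "Java runtime not found on PATH"
subprocess.run(["java", "-version"], check=True)  # e.g. prints: openjdk version "1.8.0_..."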
1 change: 1 addition & 0 deletions .gitignore
@@ -136,3 +136,4 @@ data/tmp/*.txt
 data/tmp/*.zip
 data/tmp/*.rar
 data/tmp/*.xlsx
+data/tmp/*.pdf
164 changes: 67 additions & 97 deletions data/download.py
@@ -3,80 +3,50 @@
 import sqlite3
 from zipfile import ZipFile

-import pandas as pd
 import requests
+import tabula
 from bs4 import BeautifulSoup

-from utils import (
-    create_table_postgresql,
-    create_view_postgresql,
-    file_compress,
-    insert_postgresql,
-    insert_values,
-)
+from utils import insert_values

 requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = "ALL:@SECLEVEL=1"
 BASE_URL = "https://www.set.gov.py"
-URL = f"{BASE_URL}/portal/PARAGUAY-SET/InformesPeriodicos?folder-id=repository:collaboration:/sites/PARAGUAY-SET/categories/SET/Informes%20Periodicos/listado-de-ruc-con-sus-equivalencias"
-URL_PERSONAS_JURIDICAS = f"{BASE_URL}/portal/PARAGUAY-SET/InformesPeriodicos?folder-id=repository:collaboration:/sites/PARAGUAY-SET/categories/SET/Informes%20Periodicos/lista-de-contribuyentes-que-son-personas-juridicas"
+URL_RUCS = f"{BASE_URL}/web/portal-institucional/listado-de-ruc-con-sus-equivalencias"
+URL_PERSONAS_JURIDICAS = (
+    f"{BASE_URL}/web/portal-institucional/contribuyentes-que-son-personas-jurídicas"
+)

-values = []
 values_list = []
 pj = {}


-def get_download_preview_links():
+def get_ruc_download_links():
     links = []
     try:
         soup = BeautifulSoup(
             requests.get(
-                URL,
+                URL_RUCS,
                 timeout=10,
                 headers={"user-agent": "Mozilla/5.0"},
                 verify=True,
             ).text,
             "html.parser",
         )
-        list_div = soup.select(".uiContentBox")[0]
-        lits_rows = list_div.select(".media")
-        for row in lits_rows:
-            a = row.find("a")
-            d = {}
-            d[a["title"]] = a["href"]
-            links.append(d)
-
-        return links
+        items = soup.find_all("div", class_="list__item search-item")
+        for item in items:
+            title = item.find("h3", class_="item__title").text.strip()
+            download_link = item.find("a", class_="link")["href"]
+            links.append({title: download_link})
     except requests.ConnectionError as e:
         print(f"Connection Error {e}")
     except Exception as e:
         print(e)

-    return None
-
-
-def get_download_link(url):
-    link = None
-    try:
-        soup = BeautifulSoup(
-            requests.get(
-                f"{BASE_URL}{url}",
-                timeout=10,
-                headers={"user-agent": "Mozilla/5.0"},
-                verify=True,
-            ).text,
-            "html.parser",
-        )
-        div = soup.select(".detailContainer")[0]
-        return div.select("a")[0]["href"]
-    except requests.ConnectionError as e:
-        print(f"Connection Error {e}")
-    except Exception as e:
-        print(e)
-
-    return None
+    return links


 def get_personas_juridicas_link():
     link = None
     try:
         soup = BeautifulSoup(
             requests.get(
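The rewritten get_ruc_download_links targets the markup of the new set.gov.py listing page: one div.list__item.search-item per file, with the label in an h3.item__title and the download URL on an a.link anchor. Because the anchor now carries a direct href, the old two-step flow through get_download_link is gone. A self-contained sketch of the pattern (the HTML snippet is illustrative, not captured from the live page):

from bs4 import BeautifulSoup

html = """
<div class="list__item search-item" data-value="2023">
  <h3 class="item__title"> RUC con equivalencia - Cero </h3>
  <a class="link" download="" href="/files/ruc0.zip">Descargar</a>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
links = []
for item in soup.find_all("div", class_="list__item search-item"):
    title = item.find("h3", class_="item__title").text.strip()
    download_link = item.find("a", class_="link")["href"]
    links.append({title: download_link})
print(links)  # [{'RUC con equivalencia - Cero': '/files/ruc0.zip'}]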
@@ -87,48 +57,54 @@ def get_personas_juridicas_link():
             ).text,
             "html.parser",
         )
-        list_div = soup.select(".media-body")[0]
-        download_page_url = list_div.select("a")[0]["href"]
-
-        soup = BeautifulSoup(
-            requests.get(
-                f"{BASE_URL}{download_page_url}",
-                timeout=10,
-                headers={"user-agent": "Mozilla/5.0"},
-                verify=True,
-            ).text,
-            "html.parser",
-        )
-        return soup.select(".btn-primary")[0]["href"]
+        items = soup.find_all("div", class_="list__item search-item")
+        for item in items:
+            if item.attrs["data-value"] == "2022":
+                continue
+            link = item.find("a", class_="link", attrs={"download": ""})["href"]
     except requests.ConnectionError as e:
         print(f"Connection Error {e}")
     except Exception as e:
         print(e)

-    return None
+    return link


 def extract_pj():
     """
     Column Index:
     0 = RUC
     1 = DV
     2 = NOMBRE_RAZON_SOCIAL
     3 = CATEGORIA
     """

-    def _categoria(value):
-        return "P" if value == "PEQUENO" else "M" if value == "MEDIANO" else "G"
-
-    os.system("cd tmp && unrar e juridicas.rar -y")
+    def _category(value):
+        if value == "PEQUENO":
+            return "P"
+        elif value == "MEDIANO":
+            return "M"
+        elif value == "GRANDE":
+            return "G"
+        else:
+            return "D"

     path = os.getcwd()
     path = os.path.join(path, "tmp")
-    csv_files = glob.glob(os.path.join(path, "*.xlsx"))
+    csv_files = glob.glob(os.path.join(path, "*.pdf"))

     for f in csv_files:
-        df = pd.read_excel(f, header=None)
-        for index, row in df.iterrows():
-            if row[0] != "NaN":
-                pj[f"{row[0]}-{row[1]}"] = _categoria(row[3])
+        dfs = tabula.read_pdf(f, pages="all", multiple_tables=True)
+        for df in dfs:
+            for index, row in df.iterrows():
+                try:
+                    ruc = row[0]
+                    dv = row[1]
+                    category = row[3]
+                except IndexError:
+                    ruc = row[0]
+                    dv = row[1]
+                    if row[2].endswith("PEQUENO"):
+                        category = "PEQUENO"
+                    elif row[2].endswith("MEDIANO"):
+                        category = "MEDIANO"
+                    elif row[2].endswith("GRANDE"):
+                        category = "GRANDE"
+                    else:
+                        category = "DESCONOCIDO"
+                pj[f"{ruc}-{dv}"] = _category(category)


 def download(url, filename):
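extract_pj now reads the PDF with tabula, which returns one DataFrame per detected table. The try/except IndexError appears to handle pages where Tabula collapses the name and category columns into one field: with the repo's pinned pandas 1.3.5, row[3] on a string-indexed Series is a positional lookup, so a three-field row raises IndexError and the category is recovered from the suffix of the merged column. A synthetic illustration of that fallback (the rows are made up, not real SET data):

import pandas as pd

def parse_row(row):
    try:
        return row[0], row[1], row[3]
    except IndexError:
        # Name and category were collapsed into one field; recover the
        # category from the suffix of the merged column.
        for cat in ("PEQUENO", "MEDIANO", "GRANDE"):
            if row[2].endswith(cat):
                return row[0], row[1], cat
        return row[0], row[1], "DESCONOCIDO"

clean = pd.Series(["80000001", "5", "ACME S.A.", "GRANDE"], index=["RUC", "DV", "NOMBRE", "CAT"])
merged = pd.Series(["80000002", "1", "FOO S.R.L. MEDIANO"], index=["RUC", "DV", "NOMBRE_CAT"])
print(parse_row(clean))   # ('80000001', '5', 'GRANDE')
print(parse_row(merged))  # ('80000002', '1', 'MEDIANO')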
@@ -159,16 +135,6 @@ def create_values(filename):
         values_list.append((ruc, rz, tipo, cat, dv, status))


-def build_database():
-    with open("ruc.sql", "w", encoding="utf8") as f:
-        f.write(create_table_postgresql())
-        f.write(create_view_postgresql())
-
-        chunks = [values[x : x + 1000] for x in range(0, len(values), 1000)]
-        for v in chunks:
-            f.write("\n" + insert_postgresql(",\n".join(v)))
-
-
 def build_sqlite3():
     con = sqlite3.connect("ruc.db")
     cur = con.cursor()
@@ -183,33 +149,37 @@ def build_sqlite3():
     con.close()


-def zip_database():
-    file_compress(["ruc.sql"], "../dist/ruc.zip")
+def update_counter():
+    con = sqlite3.connect("ruc.db")
+    cur = con.cursor()
+    cur.execute("SELECT count(*) as count FROM ruc")
+    data = cur.fetchone()
+    if data:
+        with open("../counter.txt", "w") as f:
+            f.write(str(data[0]))


 if __name__ == "__main__":
     print("getting links")
-    links = get_download_preview_links()
+    links = get_ruc_download_links()
     # pj
-    print("downloading juridicas.rar")
-    download(get_personas_juridicas_link(), "juridicas.rar")
-    print("extracting juridicas.rar")
+    print("downloading listado de personas juridicas")
+    download(get_personas_juridicas_link(), "personas-juridicas.pdf")
+    print("extracting listado de personas juridicas")
     extract_pj()

     for d in links:
         name = list(d.keys())[0]
         url = d[name]
         print(f"downloading {name}")
-        dl = get_download_link(url)
-        filename = download(dl, name)
+        filename = download(url, name)
         try:
             extract(filename)
         finally:
             print(f"{filename} extracted")
         print("building values")
         create_values(f"tmp/{name.split('.')[0]}.txt")
-    print("building database")
-    build_database()
     print("building sqlite database")
     build_sqlite3()
-    zip_database()
+    print("updating counter")
+    update_counter()
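update_counter replaces the old build_database/zip_database pair: instead of shipping a PostgreSQL dump in dist/ruc.zip, the script counts the rows in the sqlite ruc table and writes the number to counter.txt. A throwaway check of that query shape, using an in-memory database rather than the real ruc.db:

import sqlite3

con = sqlite3.connect(":memory:")
cur = con.cursor()
cur.execute("CREATE TABLE ruc (ruc TEXT PRIMARY KEY)")
cur.executemany("INSERT INTO ruc VALUES (?)", [("80000001",), ("80000002",)])
cur.execute("SELECT count(*) as count FROM ruc")
print(cur.fetchone()[0])  # 2, the value update_counter() writes to counter.txt
con.close()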
Binary file not shown.
43 changes: 4 additions & 39 deletions data/utils.py
@@ -1,46 +1,11 @@
 import zipfile


-def create_table_postgresql():
-    return """
-    CREATE TABLE contribuyentes
-    (
-      ruc character varying(15) NOT NULL,
-      razon_social character varying(300) NOT NULL,
-      dv character varying(1) NOT NULL,
-      ruc_str character varying(15) NOT NULL,
-      CONSTRAINT pk_contribuyentes PRIMARY KEY (ruc )
-    );
-    CREATE INDEX ruc_index
-      ON contribuyentes
-      USING btree
-      (ruc COLLATE pg_catalog."default" );
-    """
-
-
-def insert_postgresql(values):
-    return f"""INSERT INTO contribuyentes(ruc, razon_social, dv, ruc_str)
-    VALUES
-    {values};"""
-

 def insert_values(ruc, razon_social, dv, ruc_str):
     razon_social = razon_social.replace("'", "''")
-    ruc_str = ruc_str.replace("'", "''").replace(chr(92), '')
+    ruc_str = ruc_str.replace("'", "''").replace(chr(92), "")
     return f"""('{ruc}', '{razon_social}', '{dv}', '{ruc_str}')"""

-
-def create_view_postgresql():
-    return """
-    CREATE OR REPLACE VIEW ruc_contribuyentes AS
-    SELECT pg_catalog.concat(contribuyentes.ruc, '-', contribuyentes.dv) AS ruc,
-        CASE
-            WHEN substr(contribuyentes.ruc::text, 1, 1) <> '8'::text THEN substr(pg_catalog.concat(split_part(contribuyentes.razon_social::text, ','::text, 2), ' ', split_part(contribuyentes.razon_social::text, ','::text, 1)), 2, length(pg_catalog.concat(split_part(contribuyentes.razon_social::text, ','::text, 2), ' ', split_part(contribuyentes.razon_social::text, ','::text, 1))))::character varying
-            ELSE contribuyentes.razon_social
-        END AS razon_social
-    FROM contribuyentes;
-    """
-

 def file_compress(inp_file_names, out_zip_file):
     """
@@ -56,17 +21,17 @@ def file_compress(inp_file_names, out_zip_file):
     print(f" *** Input File name passed for zipping - {inp_file_names}")

     # create the zip file first parameter path/name, second mode
-    print(f' *** out_zip_file is - {out_zip_file}')
+    print(f" *** out_zip_file is - {out_zip_file}")
     zf = zipfile.ZipFile(out_zip_file, mode="w")

     try:
         for file_to_write in inp_file_names:
             # Add file to the zip file
             # first parameter file to zip, second filename in zip
-            print(f' *** Processing file {file_to_write}')
+            print(f" *** Processing file {file_to_write}")
             zf.write(file_to_write, file_to_write, compress_type=compression)
     except FileNotFoundError as e:
-        print(f' *** Exception occurred during zip process - {e}')
+        print(f" *** Exception occurred during zip process - {e}")
     finally:
         # Don't forget to close the file!
         zf.close()
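After this cleanup, utils.py keeps only insert_values, which doubles single quotes and strips backslashes before returning a SQL tuple literal, plus file_compress. A short usage sketch of insert_values; the INSERT statement, table, and column names here are illustrative, since the diff no longer shows how build_sqlite3 consumes these values:

from utils import insert_values  # assumes the data/ directory is the working directory

rows = [
    insert_values("80000001", "ACME S.A.", "5", "80000001-5"),
    insert_values("80000002", "O'HARA S.R.L.", "1", "80000002-1"),
]
# Hypothetical consumer: stitch the tuple literals into one bulk INSERT.
sql = "INSERT INTO ruc (ruc, razon_social, dv, ruc_str) VALUES\n" + ",\n".join(rows) + ";"
print(sql)  # the quote in O'HARA comes out doubled: 'O''HARA S.R.L.'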
Empty file removed dist/dummy.txt
Binary file removed dist/ruc.zip
Binary file not shown.
1 change: 1 addition & 0 deletions requirements.in
@@ -10,3 +10,4 @@ openpyxl==3.0.9
 pycryptodome==3.14.1
 python-multipart
 pip-tools
+tabula
10 changes: 7 additions & 3 deletions requirements.txt
@@ -1,8 +1,8 @@
 #
-# This file is autogenerated by pip-compile with python 3.10
+# This file is autogenerated by pip-compile with python 3.8
 # To update, run:
 #
-#    pip-compile
+#    pip-compile requirements.in
 #
 aiosqlite==0.17.0
     # via -r requirements.in
@@ -29,7 +29,9 @@ h11==0.13.0
 idna==2.10
     # via requests
 numpy==1.22.0
-    # via pandas
+    # via
+    #   pandas
+    #   tabula
 openpyxl==3.0.9
     # via -r requirements.in
 pandas==1.3.5
@@ -62,6 +64,8 @@ sqlalchemy==1.3.24
     #   databases
 starlette==0.14.2
     # via fastapi
+tabula==1.0.5
+    # via -r requirements.in
 tomli==2.0.1
     # via pep517
 typing-extensions==4.1.1