Initial commit d5588ea on main: exercise.py
yining-w authored Nov 5, 2023
Showing 1 changed file with 195 additions and 0 deletions.

exercise.py (+195, -0)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
#import pdfplumber
import fitz # PyMuPDF
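
# Packages this script assumes are installed (PyPI names):
#   pip install requests beautifulsoup4 pandas pymupdf openpyxl
# pandas needs an Excel engine such as openpyxl for the .to_excel() calls further down.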

# Working directory where the PDFs will be saved
os.chdir("C:/Users/yinin/Desktop/fadzil")
path = os.getcwd()

# ----- Set up: Get a list of links ------ #
# ----- we need to do this in two passes because of the "active card" settings
# Define the URL
url = 'https://data.ed.gov/dataset/idea-section-618-data-products-data-displays-part-b-2022/resources'

# Get the HTML content of the page
response = requests.get(url)
html_content = response.text

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find all the dataset cards
dataset_cards = soup.find_all('div', class_="inner-sidebar")

# Extract all links
links_headers = []
for card in dataset_cards:
    # Find all 'a' tags within each card
    a_tags = card.find_all('a', href=True)
    for a_tag in a_tags:
        links_headers.append(a_tag['href'])

# Put the links into a dataframe and prepend the site root
df = pd.DataFrame(links_headers, columns=['Link'])
df['Link'] = 'https://data.ed.gov' + df['Link'].astype(str)

# Remove the link that belongs to the "active card"
df = df.drop([1])
df = df.reset_index(drop=True)

# Collect the findings for each link in a list
results = []

# Loop through each row in the dataframe
for index, row in df.iterrows():
    # Replace the 'url' with the link from the dataframe
    url = row['Link']

    # Get the HTML content of the page
    response = requests.get(url)
    html_content = response.text

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the href within <a class="usa-button download-btn resource-type-None resource-url-analytics">
    a_tag = soup.find('a', class_="usa-button download-btn resource-type-None resource-url-analytics")
    href = a_tag['href'] if a_tag else 'No Link Found'

    # Find the <h5> element in the <div class="dataset-card active">
    dataset_card = soup.find('div', class_="dataset-card active")
    h5_tag = dataset_card.find('h5') if dataset_card else None
    h5_text = h5_tag.get_text(strip=True) if h5_tag else 'No Header Found'

    # Record the findings (DataFrame.append was removed in pandas 2.x, so build a list instead)
    results.append({'Href': href, 'H5': h5_text})

# Build the results dataframe
results_df = pd.DataFrame(results, columns=['Href', 'H5'])

# Remove the unnecessary trailing text from the header string
cleaned_df = results_df.copy()
cleaned_df['H5'] = cleaned_df['H5'].str[:-25]

# Save the download links and headers to an Excel file
cleaned_df.to_excel('links_and_headers.xlsx', index=False)
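
# Optional sanity check: confirm the scrape actually found links and headers
# before moving on to the downloads.
print(f"Found {len(cleaned_df)} resource links")
print(cleaned_df.head())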

# ----------- Step 2: Now, download each pdf ------------ #
for index, row in cleaned_df.iterrows():
    # Get the PDF url
    pdf_url = row['Href']

    # Make the request to get the PDF content
    response = requests.get(pdf_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Name each file after its cleaned H5 header
        desired_name = row['H5']
        filename = desired_name + ".pdf"

        # Write the PDF content to a file in the working directory
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded and saved {filename}")
    else:
        print(f"Failed to retrieve PDF from {pdf_url}")


## ----------- Step 3: Read the actual table from the pdf ---------#
## Extract each table and combine them into one spreadsheet
files = os.listdir(path)
files = [x for x in files if '.pdf' in x]

# Empty list of dataframes to append to
dfs = []

# Use this for testing a single file
#files = ['Wyoming.pdf']

# Function to parse the text block and extract rows as lists
def extract_rows_from_block(block):
    rows = []
    for line in block['lines']:
        # Assuming each 'line' corresponds to a row in the table
        row = [span['text'] for span in line['spans']]
        rows.append(row)
    return rows
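
# For reference, page.get_text("dict") (used below) returns a nested structure
# shaped roughly like this, simplified to the keys this script relies on:
#   {"blocks": [
#       {"lines": [
#           {"spans": [{"text": "..."}, ...]},
#           ...
#       ]},
#       ...
#   ]}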

# Loop through each file
for pdf_file in files:
    print(pdf_file)

    # Some pdfs fail to parse cleanly; skip those for now (see the except below)
    try:
        # Open the pdf
        with fitz.open(pdf_file) as doc:
            # Access the second page of the PDF
            page = doc[1]  # Pages are zero-indexed

            # Look for the table title and extract text from there
            found_table = False
            table_blocks = []

            # Iterate over the text "blocks" on the page
            for block in page.get_text("dict")["blocks"]:
                # Check if this block contains the table title
                # (image blocks have no "lines" key, so guard before indexing)
                lines = block.get("lines", [])
                if lines and any("PERCENT OF CHILDREN" in span['text'] for span in lines[0].get("spans", [])):
                    found_table = True
                if found_table and lines:  # Assuming that the table starts at the title block
                    table_blocks.append(block)

            # Extract rows from blocks assumed to be table data
            table_rows = []
            for block in table_blocks:
                table_rows.extend(extract_rows_from_block(block))

            # The first row would typically be the headers, the rest are data
            headers = table_rows.pop(0)
            df = pd.DataFrame(table_rows, columns=headers)

            # Combine pairs of rows by concatenating their text values
            combine_indices = [(1, 2), (3, 4), (29, 30), (33, 34)]
            for index_pair in combine_indices:
                # Concatenate the two rows and assign back to the first row index
                # (the text sits in the first column, index 0)
                df.iloc[index_pair[0], 0] = df.iloc[index_pair[0], 0] + " " + df.iloc[index_pair[1], 0]

            # Drop the second row of each pair since it's now combined with the first
            rows_to_drop = [index for pair in combine_indices for index in pair[1:]]
            df.drop(rows_to_drop, inplace=True)

            # Reset the index if needed
            df.reset_index(drop=True, inplace=True)

            # Drop rows 43 to 55 of the original table, which sit at positions 39:51
            # now that the four combined rows above have been removed
            df.drop(df.index[39:51], inplace=True)

            # Assuming df now holds the table text in a single column, convert it to
            # a list, skipping the first three values (they become the column headers
            # of the reshaped dataframe below)
            data_values = df.iloc[3:, 0].tolist()

            # Determine how many full rows of data there are (each row has 3 values)
            num_rows = len(data_values) // 3

            # Reshape the data into a new DataFrame with 3 columns
            reshaped_data = [data_values[i:i + 3] for i in range(0, num_rows * 3, 3)]
            new_df = pd.DataFrame(reshaped_data, columns=df.iloc[0:3, 0].tolist())

            # Tag each table with the file it came from
            new_df['state'] = pdf_file

            dfs.append(new_df)

    # Skip the pdf if the table was not found or could not be parsed
    except Exception:
        print("Skipped", pdf_file)
        continue  # Skip to the next file

# Concatenate all dataframes into one
final_df = pd.concat(dfs, ignore_index=True)

# Save the combined dataframe to Excel
final_df.to_excel('final_df.xlsx', index=False)
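
# Optional sanity check: reload the combined sheet and see how many state PDFs
# actually made it through the parser.
check = pd.read_excel('final_df.xlsx')
print(check['state'].nunique(), "state tables parsed")
print(check.head())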
