-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbronxschoolstaff.py
51 lines (40 loc) · 1.35 KB
/
bronxschoolstaff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Bronx School Staff web scraping
import requests
from bs4 import BeautifulSoup

# Download the staff page. The timeout keeps the script from hanging forever
# on an unresponsive server, and raise_for_status() aborts on an HTTP error
# (4xx/5xx) instead of silently parsing an error page.
p = requests.get('https://www.newvisions.org/ams2/pages/our-staff2', timeout=30)
p.raise_for_status()

# Load the page into BeautifulSoup and parse it
soup = BeautifulSoup(p.text, 'html.parser')

# Collect every <div class="matrix-content"> element; the staff data is read
# from these elements
results = soup.find_all('div', attrs={'class': 'matrix-content'})

# Drop the first 27 matches and keep the rest.
# NOTE(review): presumably the leading matches are non-staff page content;
# the offset is hard-coded, so confirm it against the live page layout.
results = results[27:]
# Spot-check the extraction logic on the first staff entry.
# The values below are evaluated but neither stored nor printed, so when run
# as a script these lines have no effect other than raising if the first
# entry is missing or lacks one of the expected tags.
test_result = results[0]

# Name: text of the entry's first <h5> tag
first_heading = test_result.find('h5')
first_heading.text

# Position(s): text of the first <p> tag with leading/trailing newline and
# tab characters stripped
first_paragraph = test_result.find('p')
first_paragraph.text.strip('\n\t')

# Email: text of the first <em> tag
first_emphasis = test_result.find('em')
first_emphasis.get_text()
# Data extraction: build one (name, position, email) tuple per staff entry
info = []
for result in results:
    # Name comes from the first <h5>; position(s) from the first <p>, with
    # leading/trailing newline and tab characters stripped.
    name = result.find('h5').text
    position = result.find('p').text.strip('\n\t')

    # An entry may have no <em> tag (the email is read from it). find()
    # returns None in that case, so test for it explicitly instead of using
    # a bare `except:`, which would also swallow unrelated errors such as
    # KeyboardInterrupt.
    email_tag = result.find('em')
    email = email_tag.get_text() if email_tag is not None else 'NaN'

    info.append((name, position, email))
# Build a DataFrame from the scraped tuples and export it to CSV
import pandas as pd

column_names = ['Name', 'Position(s)', 'Email']
df = pd.DataFrame(info, columns=column_names)

# Duplicate report: for each column, print the per-row boolean mask of
# repeated values, followed by how many rows are flagged
for col_name in df.columns:
    repeat_mask = df.duplicated(subset=[col_name])
    print(repeat_mask)
    print(repeat_mask.sum())

# Eliminate duplicates: keep only the first row seen for each name
df.drop_duplicates(subset=['Name'], keep='first', inplace=True)

# Write the CSV without the numbered index column
df.to_csv('BronxSchoolStaffInfo.csv', index=False)