WeWork Address Scraper.py
# coding: utf-8
# Importing Packages
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Getting the HTML of the WeWork locations index page
url = 'https://www.wework.com/locations'
response = requests.get(url, params={"search_api_views_fulltext": ""})
soup = BeautifulSoup(response.text, "lxml")
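# "lxml" requires the third-party lxml package; the stdlib parser is a
# drop-in fallback if it is not installed:
# soup = BeautifulSoup(response.text, "html.parser")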
# note: if this selector stops matching, inspect one of the city links to find the new tag and class
locations = soup.find_all("a", {"class":"sl_norewrite marketLink__countryList__F4CBD baseLink__countryList__22den weLink-sc-5sbo5g-0 kbLrON"})
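# The hashed suffixes in those class names (e.g. "__F4CBD") look
# auto-generated, so they can rotate between deploys. A looser fallback is
# sketched below, assuming the "marketLink" prefix stays stable (an
# assumption, not verified against the live site):
# locations = soup.find_all(
#     "a", class_=lambda c: c is not None and c.startswith("marketLink"))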
# Getting Clean Location Link Stubs
link_stub_list = []
for loc in locations:
    link_stub_list.append(loc['href'])
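# The stubs are joined to the base URL below with plain string concatenation,
# which assumes every href is a relative path. urllib.parse.urljoin is a
# sketch of a more tolerant join that also handles absolute hrefs:
# from urllib.parse import urljoin
# url = urljoin("https://www.wework.com", city)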
coworking_spaces = []
# Iterating over each location
for city in tqdm(link_stub_list):
    delay = 0
    # Retrying when the call fails and space therefore stays empty
    # (a capped-retry sketch follows this loop)
    space = []
    while len(space) < 1:
        # Creating a dynamic link based on the link stub
        url = "https://www.wework.com" + city
        response = requests.get(url, params={"search_api_views_fulltext": ""})
        soup = BeautifulSoup(response.text, "lxml")
        # Testing whether the location is coming soon
        coming_soon = soup.find_all("div", {"class": "ray-tag coming-soon_tag"})
        # Skipping ahead if the location is coming soon
        if len(coming_soon) > 0:
            print(city + " Coming Soon, Skipped")
            break
        # Getting the address block from each location (if this stops matching,
        # inspect a coworking space's address and update the selector)
        space = soup.find_all("div", {"class": "ray-text--body building-card__address"})
        # Adding a delay to avoid timeouts; it grows each time the call fails
        delay = delay + .1
        time.sleep(delay)
    # Appending messy location markup to a list of tag result sets
    coworking_spaces.append(space)
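# The while loop above retries forever if a page never exposes the address
# div (for example, after a markup change). A capped variant is a safer
# sketch; the limit of 20 attempts is an assumption, not part of the
# original design:
# space = []
# for attempt in range(20):
#     soup = BeautifulSoup(requests.get(url).text, "lxml")
#     space = soup.find_all("div", {"class": "ray-text--body building-card__address"})
#     if space:
#         break
#     time.sleep(0.1 * (attempt + 1))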
# Flattening out the nested list
flat_list = [item for sublist in coworking_spaces for item in sublist]
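# For example, [[tag1], [], [tag2, tag3]] flattens to [tag1, tag2, tag3]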
# Creating lists to put cleaned strings in
cleaned_full_address_list = []
cleaned_state_list = []
cleaned_zip_list = []
# Cleaning up addresses to extract the full address, state, and zip
for messy_address in flat_list:
    cleaner_address = str(messy_address).split(">")[1]
    cleaned_address = cleaner_address.split("<")[0]
    state = cleaned_address.split(" ")[-2]
    zipcode = cleaned_address.split(" ")[-1]
    # Trying to avoid adding international locations; this FAILS in a few
    # locations whose string patterns happen to look like US addresses
    # (a stricter check is sketched after this loop)
    if len(zipcode) == 5 and len(state) == 2:
        cleaned_state_list.append(state)
        cleaned_zip_list.append(zipcode)
    else:
        cleaned_state_list.append("")
        cleaned_zip_list.append("")
    # Appending the full address to the list
    cleaned_full_address_list.append(cleaned_address)
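# A stricter test in place of the length check above would cut down the
# false positives just noted; a sketch, assuming a US zip is exactly five
# digits and a state code is two uppercase letters:
# import re
# is_us = bool(re.fullmatch(r"\d{5}", zipcode)) and bool(re.fullmatch(r"[A-Z]{2}", state))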
# Converting the lists into a dataframe
address_df = pd.DataFrame(
    list(zip(cleaned_full_address_list, cleaned_state_list, cleaned_zip_list)),
    columns=['Full Address', 'State', 'Zip'])
# Writing the CSV to the local directory
address_df.to_csv("WeWork Addresses.csv", encoding='utf-8')
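# to_csv also writes the DataFrame's integer index as an unnamed first
# column by default; pass index=False to drop it. Reading the file back:
# pd.read_csv("WeWork Addresses.csv")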