-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_mars.py
107 lines (80 loc) · 3.04 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Dependencies
from bs4 import BeautifulSoup
from splinter import Browser
import requests
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
def init_browser():
    """Start a splinter-controlled Chrome browser and return it.

    The value returned by ``ChromeDriverManager().install()`` is handed to
    splinter as ``executable_path``. The browser is started with
    ``headless=False``.

    Returns:
        The splinter ``Browser`` object. The caller is responsible for
        calling ``quit()`` on it when finished.
    """
    driver_path = ChromeDriverManager().install()
    return Browser('chrome', executable_path=driver_path, headless=False)
def scrape():
    """Scrape several Mars-related web pages and collect the results.

    Opens a browser via ``init_browser()``, gathers the first news article,
    the featured image address, an HTML facts table and four hemisphere
    images, and always closes the browser afterwards, even when one of the
    scraping steps raises.

    Returns:
        dict: with the keys ``news_title``, ``news_p``, ``feat_img``,
        ``mars_facts_table`` and ``hemisphere_image_urls`` (a list of
        ``{'title': ..., 'img_url': ...}`` dictionaries).
    """
    browser = init_browser()
    try:
        news_title, news_p = _scrape_news(browser)
        feat_img = _scrape_featured_image(browser)
        mars_facts_table = _scrape_facts_table()
        hemisphere_image_urls = _scrape_hemispheres(browser)
    finally:
        # Close the browser on every exit path so a failed lookup does not
        # leave a browser/driver process behind.
        browser.quit()

    # Store all data in the dictionary
    return {
        "news_title": news_title,
        "news_p": news_p,
        "feat_img": feat_img,
        "mars_facts_table": mars_facts_table,
        "hemisphere_image_urls": hemisphere_image_urls,
    }


def _scrape_news(browser):
    """Return ``(title, teaser)`` text of the first news article."""
    browser.visit('https://mars.nasa.gov/news/')
    # Short pause before reading the page source (presumably to let the
    # page finish rendering; confirm).
    time.sleep(1)
    soup = BeautifulSoup(browser.html, "html.parser")
    # NOTE(review): the second 'content_title' match (index 1) is used,
    # presumably because the first match is not an article headline; confirm.
    title = soup.find_all('div', class_='content_title')[1].text
    teaser = soup.find('div', class_='article_teaser_body').text
    return title, teaser


def _scrape_featured_image(browser):
    """Return the ``src`` attribute of the featured image's main ``<img>``."""
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    # Navigate to the featured image detail page.
    browser.links.find_by_partial_text('FULL IMAGE').click()
    browser.links.find_by_partial_text('more info').click()
    soup = BeautifulSoup(browser.html, 'html.parser')
    # NOTE(review): the attribute is returned exactly as found in the page;
    # if the site uses relative paths the consumer must add the origin.
    return soup.find('img', class_='main_image')['src']


def _scrape_facts_table():
    """Return the first table of the Mars facts page rendered as HTML."""
    tables = pd.read_html('https://space-facts.com/mars/')
    mars_df = tables[0]
    mars_df.columns = ['Description', 'Mars']
    mars_df.set_index('Description', inplace=True)
    return mars_df.to_html()


def _scrape_hemispheres(browser):
    """Return a list of ``{'title', 'img_url'}`` dicts for four hemispheres."""
    results_url = (
        'https://astrogeology.usgs.gov/search/results'
        '?q=hemisphere+enhanced&k1=target&v1=Mars'
    )
    hemispheres = []
    for index in range(4):
        # Clicking a link leaves the results page, so reload it each time
        # before selecting the next link.
        browser.visit(results_url)
        browser.links.find_by_partial_text('Hemisphere Enhanced')[index].click()
        soup = BeautifulSoup(browser.html, 'html.parser')
        img_url = soup.find('img', class_='wide-image')['src']
        title = soup.find('h2', class_='title').text
        hemispheres.append({'title': title, 'img_url': img_url})
    return hemispheres