scholar_scrape.py
from bs4 import BeautifulSoup
import requests
import json


def create_list():
    url = input("Type in the url for a google scholar profile: ")
    # Get the HTML for a person's profile on Google Scholar
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'lxml')
    # Each 'gsc_a_tr' table row holds one publication; collect them all
    publications = soup.find_all('tr', class_='gsc_a_tr')
    # Loop through the publications, pooling them into a list of dicts
    # that we can serialize as JSON
    data = []
    for publication in publications:
        curr_pub = {}
        # Access the publication title (the first link in the row)
        title = publication.td.a.text
        curr_pub["title"] = title
        divs = publication.find_all('div', class_='gs_gray')
        # Access the publication authors (I should probably store each
        # author individually, but I haven't tried to yet)
        authors = divs[0].text
        # Get the publication citation info
        citation = divs[1].text
        curr_pub["authors"] = authors
        curr_pub["citation"] = citation
        year = publication.find('td', class_='gsc_a_y').text
        curr_pub["year"] = year
        link = 'https://scholar.google.com' + str(publication.td.a['href'])
        curr_pub["link"] = link
        data.append(curr_pub)
    return json.dumps(data)


def create_json():
    data_list = create_list()
    file_path = input("Type in a file path: ")
    with open(file_path, 'w') as f:
        f.write(data_list)


if __name__ == "__main__":
    create_json()
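
# Illustrative sketch of the JSON this script writes, based on the keys
# set above (the values are placeholders, not real scraped data): a list
# with one object per publication row on the profile page.
#
# [
#   {
#     "title": "An Example Paper Title",
#     "authors": "A Author, B Author",
#     "citation": "Journal of Examples 12 (3), 45-67, 2020",
#     "year": "2020",
#     "link": "https://scholar.google.com/citations?view_op=view_citation&..."
#   }
# ]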