-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcmp_scrape.py
102 lines (87 loc) · 3.02 KB
/
cmp_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import time
# Running count of pages skipped during the scrape.
numDataError = 0
# Total number of listing pages on coinmarketcap; update this value
# whenever the page count shown on the site changes.
numPages = 84
# Per-column accumulators (one independent list each) that are later
# copied into the pandas DataFrame.
pageNum, slug, name, symbol = [], [], [], []
price, market_cap, volume, volcap = [], [], [], []
cirsupply, maxsupply, totsupply, OneDayChange = [], [], [], []
# Output columns, in the order they appear in the exported file.
output_columns = [
    'PageNum',
    'slug',
    'name',
    'symbol',
    'price',
    'MarketCap',
    'volume',
    'Volume / MarketCap',
    'Circulating Supply',
    'Max Supply',
    'Total Supply',
    '24hr Change',
]
df = pd.DataFrame(columns=output_columns)
#Load coins
# Fetch every listing page, read the JSON embedded in its __NEXT_DATA__
# script tag, and append one value per coin to the module-level
# accumulator lists. Pages whose payload is missing or reports a different
# page number are skipped and counted in numDataError.
#
# Fields looked up in each page's 'keysArr' header; a missing one raises
# ValueError. 'maxSupply' is looked up separately because a few pages
# leave it out.
required_fields = (
    'name',
    'quote.USD.marketCap',
    'quote.USD.volume24h',
    'slug',
    'symbol',
    'quote.USD.percentChange24h',
    'totalSupply',
    'circulatingSupply',
    'quote.USD.price',
)
for x in range(1, numPages+1):
    print(f'Gathering Coin data from page {x} of {numPages}')
    # Bound the wait so a stalled connection cannot hang the whole run.
    cmc = requests.get(f'https://coinmarketcap.com/?page={x}', timeout=30)
    time.sleep(0.5)
    soup = BeautifulSoup(cmc.content, 'html.parser')
    data = soup.find('script', id="__NEXT_DATA__", type="application/json")
    if data is None or not data.contents:
        # The response carries no embedded JSON payload; count it as a
        # skipped page instead of crashing on data.contents[0].
        print(f'Data not found, skipping page {x}')
        numDataError += 1
        continue
    coin_data = json.loads(data.contents[0])
    latest = coin_data['props']['initialState']['cryptocurrency']['listingLatest']
    listings = latest['data']
    #make sure the page number and the data are equal
    if x != int(latest['page']):
        print(f'Data not equal, skipping page {x}')
        numDataError += 1
        continue
    # Column positions are reset for every page so a page can never be
    # decoded with the layout of a previous one.
    col = None
    max_supply_col = None
    for i in listings:
        if isinstance(i, dict):
            #first need to get the column indexes of the data we want to pull back
            keys = i['keysArr']
            col = {field: keys.index(field) for field in required_fields}
            #not sure why, but on a few pages they exclude maxsupply
            # None (not a numeric sentinel) marks "absent", so a
            # legitimate index of 0 is still honoured.
            max_supply_col = keys.index('maxSupply') if 'maxSupply' in keys else None
        elif isinstance(i, list):
            if col is None:
                # Row seen before any 'keysArr' header: positions unknown.
                continue
            cap = i[col['quote.USD.marketCap']]
            #this will export all coins with a marketcap greater than zero
            if cap is None or int(cap) <= 0:
                continue
            day_volume = i[col['quote.USD.volume24h']]
            pageNum.append(x)
            market_cap.append(cap)
            volume.append(day_volume)
            slug.append(i[col['slug']])
            name.append(i[col['name']])
            symbol.append(i[col['symbol']])
            price.append(i[col['quote.USD.price']])
            OneDayChange.append(i[col['quote.USD.percentChange24h']])
            # cap was just checked to be positive, so this cannot divide by zero.
            volcap.append(day_volume / cap)
            cirsupply.append(i[col['circulatingSupply']])
            totsupply.append(i[col['totalSupply']])
            if max_supply_col is not None:
                maxsupply.append(i[max_supply_col])
            else:
                maxsupply.append(0)
print(f'Total Pages skipped: {numDataError}')
# Copy each accumulator list into its DataFrame column (same order as the
# column layout), then export the table without the index column.
for column, values in (
    ('PageNum', pageNum),
    ('slug', slug),
    ('name', name),
    ('symbol', symbol),
    ('price', price),
    ('MarketCap', market_cap),
    ('volume', volume),
    ('Volume / MarketCap', volcap),
    ('Circulating Supply', cirsupply),
    ('Max Supply', maxsupply),
    ('Total Supply', totsupply),
    ('24hr Change', OneDayChange),
):
    df[column] = values
df.to_csv('cmp_out.csv', index=False)