-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscrap_cars.py
executable file
·83 lines (63 loc) · 2.29 KB
/
scrap_cars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import requests, bs4
import concurrent.futures
import time
import get_advertisement
import pandas as pd
import os
# Maximum number of worker threads used to fetch result pages in parallel.
MAX_THREADS = 8

# Model slugs (one per line, e.g. "audi/a4") read from car_models.txt in the
# current working directory.  The original left the file handle open; a
# `with` block closes it deterministically.
path = os.path.join(os.getcwd(), 'car_models.txt')
with open(path, 'r', encoding='utf-8') as f:
    models = f.readlines()

# Shared accumulator of advertisement URLs for the model currently being
# scraped.  It is cleared by scrap_model() before each model's pages are
# fetched and appended to from worker threads (list.append is atomic).
links = []
def get_cars_in_page(path, i):
    """Fetch page *i* of the search results at *path* and append every
    advertisement link found there to the shared module-level ``links`` list.

    Parameters
    ----------
    path : str
        Base search URL for one car model (without a query string).
    i : int
        1-based page number to request.

    Raises
    ------
    requests.HTTPError
        If the page request returns a non-2xx status.
    """
    print(i, path)
    # timeout prevents a stalled server from hanging a worker thread forever
    res = requests.get(path + '?page=' + str(i), timeout=30)
    res.raise_for_status()
    current_page = bs4.BeautifulSoup(res.text, features='lxml')
    results = current_page.find('main', attrs={'data-testid': 'search-results'})
    if results is None:
        # Page layout changed or the page holds no results — nothing to do.
        # (The original would have raised AttributeError here.)
        return
    for article in results.find_all('article'):
        anchor = article.find('a', href=True)
        if anchor is not None:
            links.append(anchor['href'])
def scrap_model(model):
    """Scrape all advertisement links for one car model and hand them to
    ``get_advertisement.main``.

    Parameters
    ----------
    model : str
        One line from car_models.txt (trailing newline is stripped), used
        as the path suffix of the otomoto.pl search URL.
    """
    model = model.replace('\n', '')
    path = 'https://www.otomoto.pl/osobowe/' + model
    print(path)
    try:
        res = requests.get(path, timeout=30)
        res.raise_for_status()
        carSoup = bs4.BeautifulSoup(res.text, features="lxml")
    except Exception as exc:
        # BUG FIX: the original swallowed this error with `pass` and then
        # crashed on the undefined `carSoup` below.  Skip the model instead.
        print('Skipping', model, '-', exc)
        return
    try:
        # The last pagination list item holds the total page count.
        lastPage = int(carSoup.find_all(
            'li', attrs={'data-testid': 'pagination-list-item'})[-1].text)
    except Exception:
        # No pagination widget rendered -> a single page of results.
        lastPage = 1
    lastPage = min(lastPage, 500)  # otomoto caps pagination at 500 pages
    print("Liczba podstron modelu = ", lastPage)
    threads = min(MAX_THREADS, lastPage)
    # Reset the shared accumulator before fanning out the page fetches.
    links.clear()
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(get_cars_in_page, [path] * lastPage,
                     range(1, lastPage + 1))
    get_advertisement.main(model, links)
    time.sleep(0.25)  # small pause between models to be polite to the server
# Scrape every model, then merge the per-model spreadsheets that
# get_advertisement wrote under data/ into a single car.xlsx.
for model in models:
    scrap_model(model)

xlsx_filenames = ['data/' + model.replace('\n', '') + '.xlsx'
                  for model in models]
combined_df = []
for filename in xlsx_filenames:
    print(filename)
    try:
        # 'Unnamed: 0' is the auto-written index column of each spreadsheet.
        combined_df.append(pd.read_excel(filename, index_col='Unnamed: 0'))
    except Exception:
        # Best effort: models that failed to scrape have no spreadsheet.
        pass

# BUG FIX: pd.concat([]) raises ValueError; only combine when at least one
# per-model spreadsheet was actually read.
if combined_df:
    df_all = pd.concat(combined_df, ignore_index=True)
    df_all.to_excel('car.xlsx', index=False)
else:
    print('No per-model spreadsheets found - nothing to combine.')