LineBlog_img_crawler.py
#!/usr/bin/python3
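"""Download the images of LINE Blog articles.

Prompts for an archive URL and a saving path, then crawls the chosen
articles (latest article only, current page only, all paginated pages,
or one article at a given position) and saves each article's images
into its own directory.
"""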
from urllib.request import urlopen as uReq
from urllib.request import urlretrieve as uRetr
from bs4 import BeautifulSoup as soup
import os

raw_url = input("\nPlease Input the URL: (e.g. https://lineblog.me/uesaka_sumire/archives/2018-11.html)\nURL: ")
folder_dirname = input("\nPlease Input the Saving Path: (e.g. ~/[YOUR_DIRNAME])\nPath: ")
dirname_without_title = input("\nWould you like directory names without the article title? (Y/N) ")
crawler_mode = input("\nPlease Choose Mode:\n 0 --- Current Page's Latest Article ONLY\n 1 --- Current Page ONLY\n 2 --- ALL Related Pages\n 3 --- Current Page at Specific Position\nMode: ")


def open_url_to_soup(url):
    # Opening up connection, grabbing the page.
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # HTML parsing.
    page_soup = soup(page_html, "html.parser")
    return page_soup


def find_target_urls(raw_url):
    # Collect the URL of every page linked from the pagination bar.
    page_soup = open_url_to_soup(raw_url)
    target_urls = []
    last_page_url = None
    last_page_num = None
    paging_last = page_soup.find("li", {"class": "paging-last"})
    if paging_last is not None:
        for page in paging_last:
            last_page_url = page.get("href")
            last_page_num = page.string
            # print("Last URL:", last_page_url)
            # print("Last Page Num:", last_page_num)
    if last_page_num is not None:
        # Build "?p=1" .. "?p=<last page>" from the last-page link.
        base_url = last_page_url.split("?p=")[0] + "?p="
        for i in range(1, int(last_page_num) + 1):
            concat_url = base_url + str(i)
            target_urls.append(concat_url)
            # print("Concat URL:", concat_url)
    else:
        # No last-page shortcut: read the numbered paging links instead.
        paging_number = page_soup.find("ol", {"class": "paging-number"})
        target_pages = paging_number.find_all("a")
        target_page_num = 1
        for target_page in target_pages:
            target_urls.append(target_page.get("href"))
            if target_page_num < int(target_page.string):
                target_page_num = int(target_page.string)
    return target_urls


def mkdir(path):
    # Create the target directory if it does not already exist.
    if not os.path.exists(path):
        os.makedirs(path)
        print("--- Made New Dir ---")
    else:
        print("--- Already Exists ---")


def downloadImg(imgURLs_article, article_name):
    imgID = 0
    for imgURL_article in imgURLs_article:
        imgURL = imgURL_article.get('href')
        # Build the directory name from the article header text,
        # dropping the title part if the user asked for that.
        if dirname_without_title == 'N' or dirname_without_title == 'n':
            folder_basename = article_name.split("\n")[2] + " " + article_name.split("\n")[1]
        else:
            folder_basename = article_name.split("\n")[2]
        # Replace characters that are unsafe in directory names.
        folder_basename = folder_basename.replace('/', '-')
        folder_basename = folder_basename.replace(':', '-')
        folder_basename = folder_basename.replace('`', '-')
        folder_path = os.path.join(os.path.expanduser(folder_dirname), folder_basename)
        mkdir(folder_path)
        uRetr(imgURL, folder_path + "/%03d.jpg" % imgID)
        print(imgURL)
        imgID = imgID + 1


def mono_article_parse(article):
    # Pull the article title and its full-size image links, then download them.
    title_article = article.find("header", {"class": "article-header"})
    article_name = title_article.text
    imgURLs_article = article.find("div", {"class": "article-body-inner"}).find_all("a", {"target": "_blank"})
    downloadImg(imgURLs_article, article_name)


def whole_parse_and_download(target_url):
    # Download every article on the page.
    page_soup = open_url_to_soup(target_url)
    # Grabs each article element.
    articles = page_soup.find_all("article", {"class": "article"})
    for article in articles:
        mono_article_parse(article)


def specific_parse_and_download(target_url, article_position=0):
    # Download a single article, selected by its position on the page.
    page_soup = open_url_to_soup(target_url)
    # Grabs each article element.
    articles = page_soup.find_all("article", {"class": "article"})
    mono_article_parse(articles[article_position])


if crawler_mode == '0':
    specific_parse_and_download(raw_url)
elif crawler_mode == '1':
    whole_parse_and_download(raw_url)
elif crawler_mode == '2':
    # whole_parse_and_download(raw_url)
    target_urls = find_target_urls(raw_url)
    for target_url in target_urls:
        whole_parse_and_download(target_url)
elif crawler_mode == '3':
    article_position = input("\nPlease Choose the Article Position (Start with 1): ")
    specific_parse_and_download(raw_url, int(article_position) - 1)