-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinsideofknoxville.py
91 lines (77 loc) · 4.19 KB
/
insideofknoxville.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Scraper for the monthly archive pages of insideofknoxville.com
import requests
import dateparser
from bs4 import BeautifulSoup
from newspaper import Article
# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block the whole scrape indefinitely.
_REQUEST_TIMEOUT = 30


def _fetchSoup(url):
    """Download *url* and return its body parsed with BeautifulSoup's html.parser."""
    page = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(page.content, "html.parser")


def _listArticles(soup):
    """Return the <article> tags inside div#main, or [] when that container is missing."""
    results = soup.find("div", id="main")
    if results is None:
        return []
    return results.find_all("article")


def _hasNextPage(soup):
    """Return True when the archive page's pagination block contains a "next" link."""
    pageSelector = soup.find("div", class_="pagination tipi-col tipi-xs-12 font-2")
    if pageSelector is None:  # no pagination block at all -> nothing to follow
        return False
    return pageSelector.find("a", class_="next page-numbers") is not None


def _strippedText(tag):
    """Return the whitespace-stripped text of *tag*, or None when the tag was not found."""
    if tag is None:
        return None
    return tag.text.strip()


def _buildArticleData(articleBody):
    """Convert the article body into a list of HTML fragments.

    Only <p>, <figure> and <ul> elements are considered. Gallery figures are
    dropped, other figures and attribute-less <ul> lists are kept as raw
    markup, and everything else is reduced to its text wrapped in <p> tags.
    A missing body yields an empty list.
    """
    articleData = []
    if articleBody is None:
        return articleData
    for item in articleBody.find_all(["p", "figure", "ul"]):
        markup = str(item)
        if markup.startswith("<figure class=\"gallery-item\""):  # excluding gallery item images
            continue
        if markup.startswith(("<figure", "<ul>")):  # images with captions, and list items
            articleData.append(markup)
        else:
            articleData.append(f'<p>{item.text}</p>')  # paragraphs
    return articleData


def _scrapeArticle(nURL, startDate, endDate):
    """Fetch one article page and return an Article, or None when it must be skipped.

    An article is skipped when the page has no <article> element, when its
    date is missing or cannot be parsed, or when the parsed date lies outside
    the inclusive range [startDate, endDate].
    """
    nResults = _fetchSoup(nURL).find("article")
    if nResults is None:
        print("Skipping article with unexpected layout: ", nURL)
        return None

    title = _strippedText(nResults.find("h1", class_="title"))
    author = _strippedText(nResults.find("span", class_="author"))
    dateText = _strippedText(nResults.find("span", class_="date"))

    # dateparser.parse returns None for text it cannot interpret, and must not
    # be handed None/empty input, so both cases are treated as "no date".
    parsed = dateparser.parse(dateText, settings={'DATE_ORDER': 'YMD'}) if dateText else None
    if parsed is None:
        print("Skipping article with unreadable date: ", nURL)
        return None

    parsedDate = parsed.date()
    if parsedDate > endDate or parsedDate < startDate:  # only grabbing articles within date range
        print("Article out of date range: ", parsedDate)
        return None
    print("Got Article: ", parsedDate)

    articleData = _buildArticleData(nResults.find("div", class_="entry-content-wrap"))
    return Article(parsedDate, title, nURL, author, articleData)


def getArticles(newspaper, startDate, endDate):
    """Scrape insideofknoxville.com articles dated between startDate and endDate.

    Every monthly archive (https://insideofknoxville.com/<year>/<month>/page/<n>/)
    from the month of startDate through the month of endDate is walked page by
    page. Each linked article is downloaded, and those whose date falls inside
    the inclusive range are added to *newspaper*.

    Args:
        newspaper: object exposing ``add_article(article)``; it receives one
            Article per accepted page.
        startDate: first accepted date. Must provide ``.year``/``.month`` and
            be comparable with ``datetime.date`` values.
        endDate: last accepted date, same requirements as startDate.

    Returns:
        The same *newspaper* object that was passed in. When endDate's month
        precedes startDate's month nothing is fetched.

    Raises:
        requests.exceptions.RequestException: if an HTTP request fails or
            exceeds the timeout.
    """
    # total months within date range
    totalMonths = (endDate.year - startDate.year) * 12 + endDate.month - startDate.month + 1
    print("Total Months: ", totalMonths)

    # Count months on a single axis (year * 12 + zero-based month) so that the
    # December -> January rollover falls out of divmod.
    firstMonthIndex = startDate.year * 12 + startDate.month - 1
    for monthIndex in range(firstMonthIndex, firstMonthIndex + totalMonths):
        year, zeroBasedMonth = divmod(monthIndex, 12)
        month = zeroBasedMonth + 1

        pageNumber = 1
        URL = f"https://insideofknoxville.com/{year}/{month}/page/{pageNumber}/"
        print(URL)
        soup = _fetchSoup(URL)

        while True:
            # grabbing all articles listed on the current archive page
            for article in _listArticles(soup):
                link = article.find("a")
                nURL = link.get("href") if link is not None else None
                if not nURL:  # teaser without a usable link
                    continue
                newArticle = _scrapeArticle(nURL, startDate, endDate)
                if newArticle is not None:
                    newspaper.add_article(newArticle)

            if not _hasNextPage(soup):  # if no next page, this month is done
                break

            # advancing to the next archive page of the same month
            pageNumber = pageNumber + 1
            URL = f"https://insideofknoxville.com/{year}/{month}/page/{pageNumber}/"
            soup = _fetchSoup(URL)

    return newspaper