-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrap.py
36 lines (26 loc) · 1.09 KB
/
scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import unicodedata
# URL of the single article page that this script downloads.
website_url = "https://baahrakhari.com/news-details/169935/2018-12-19"
# Encoding forced onto the HTTP response and used to encode the extracted text.
DEFAULT_ENCODING = "utf-8"
# Filter predicate: drops blank lines and lines of text other than Devanagari.
# Only the first character of each line is checked for Devanagari; this can be improved.
def removeUnwatedData(text):
    """Return True when a line should be kept, i.e. it starts with Devanagari.

    Args:
        text: One line of text, given either as UTF-8 encoded bytes or as
            already-decoded text.

    Returns:
        bool: False for empty or whitespace-only lines and for lines whose
        first non-whitespace character is not a Devanagari character;
        True otherwise.

    Raises:
        UnicodeDecodeError: If ``text`` is bytes that are not valid UTF-8.

    Only the first character is inspected, so a line that merely starts
    with a Devanagari character is kept in full.
    """
    # Decode before stripping so that Unicode whitespace is stripped too and
    # so that indexing yields a whole character rather than a single byte.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    stripped = text.strip()
    if not stripped:
        return False
    # A default is passed because unicodedata.name() raises ValueError for
    # characters that have no name (e.g. control characters).
    return 'DEVANAGARI' in unicodedata.name(stripped[0], '')
# Removes spaces on front and back of the text on each line
def removeExtraSpace(text):
    """Return ``text`` without its leading and trailing whitespace."""
    trimmed = text.strip()
    return trimmed
# Make request to the website. A timeout is given because requests would
# otherwise wait indefinitely if the server never answers.
site_resp = requests.get(website_url, timeout=30)
# Fail loudly on an HTTP error instead of scraping the error page
site_resp.raise_for_status()
# Change encoding of website to UTF-8 ( Life saving trick ): it has to be
# set before .text is read, because .text is decoded with this encoding
site_resp.encoding = DEFAULT_ENCODING
html_doc = site_resp.text
soup = BeautifulSoup(html_doc, 'html.parser')
if soup.body is None:
    raise ValueError("No <body> element found in %s" % website_url)
# The text is handled as UTF-8 encoded bytes from here on, which is the
# form the line filter decodes from
body_text = soup.body.get_text().encode(DEFAULT_ENCODING)
# Keep only the wanted lines, trimmed of surrounding whitespace
text_list = [
    removeExtraSpace(line)
    for line in body_text.splitlines()
    if removeUnwatedData(line)
]
# Joins items of the list with a line break (a bytes separator, since the
# lines are bytes)
final_text = b'\n'.join(text_list)
# Binary mode writes the UTF-8 bytes unchanged; the with-block closes the
# file even if the write fails
with open('news.txt', 'wb') as news_file:
    news_file.write(final_text)