wikipedia_scrape.py
# https://www.guru99.com/reading-and-writing-files-in-python.html
# https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
# https://towardsdatascience.com/how-to-web-scrape-with-python-in-4-minutes-bc49186a8460
# https://docs.python.org/3/howto/regex.html
# https://stackoverflow.com/questions/2136267/beautiful-soup-and-extracting-a-div-and-its-contents-by-id#2136323
# Written by Jacob Hillock / jmanh128 on GitHub, using the above references for help.
import requests
import urllib.request
import time
import re
import os
from pathlib import Path
from bs4 import BeautifulSoup
from io import StringIO
from html.parser import HTMLParser
from cross_platform_support import dir_separater
import argparse


class MLStripper(HTMLParser):
    """Accumulate only the text content of any HTML fed to the parser."""
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()


def strip_tags(html):
    # Strip all HTML tags from a string, keeping only the text.
    s = MLStripper()
    s.feed(html)
    return s.get_data()


def scrape(url):
    s = dir_separater()
    time.sleep(.05)  # small delay to be polite to Wikipedia's servers
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # The article title lives in the <h1 id="firstHeading"> element
    title = strip_tags(str(soup.find("h1", {"id": "firstHeading"})))
    file_name = title.replace(" ", "_").replace("/", "_") + ".txt"
    if not Path(f'scrapes{s}').is_dir():
        os.makedirs(f'scrapes{s}')
    if not Path(f"scrapes{s}{file_name}").is_file():
        # Skip the first <p>, which is often empty on Wikipedia articles
        paras = soup.findAll('p')[1:]
        for i in range(len(paras)):
            paras[i] = strip_tags(str(paras[i]))
        with open(f"scrapes{s}{file_name}", "w+", encoding="utf-8") as file:
            data = ""
            for p in paras:
                data += p
            # Remove citation markers such as [1], [23]
            regex = re.compile(r"\[\d*\]")
            data = regex.sub("", data)
            file.write(data)
            print('File written')
    return file_name


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--article', default='Artificial_Intelligence',
                        help='Keyword to scrape [default: Artificial_Intelligence]')
    FLAGS = parser.parse_args()
    # Capture the keyword to scrape from Wikipedia
    article = FLAGS.article
    url = "https://en.wikipedia.org/wiki/" + article
    scrape(url)
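
# Example invocation (a minimal sketch, assuming this script sits next to
# cross_platform_support.py and is run from the repository root):
#
#   python wikipedia_scrape.py --article Machine_learning
#
# This fetches https://en.wikipedia.org/wiki/Machine_learning and, if the file
# does not already exist, writes the article's paragraph text to
# scrapes/Machine_learning.txt (named after the page's "firstHeading" title).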