-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparser.py
80 lines (71 loc) · 2.27 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gutenberg
import bs4
from bs4 import BeautifulSoup
from operator import itemgetter
def lines_between(cur, end):
    """Yield text fragments found while walking the parse tree from *cur*.

    The walk follows ``next_element`` links and stops as soon as the current
    node is falsy or is contained in *end* (tested with ``in``).

    * Strings whose parent is an ``<a>`` tag are skipped.
    * Any other string is stripped and then suffixed: a blank line
      (``'\\n\\n'``) when it is the last child of a parent that is not
      ``<i>``/``<b>``/``<strong>``, a single space otherwise.
    * A ``<br>`` tag yields a blank line.
    """
    inline_tags = ('i', 'b', 'strong')
    node = cur
    while node and node not in end:
        if isinstance(node, bs4.NavigableString) and node.parent.name != 'a':
            closes_block = (
                node.next_sibling is None
                and node.parent.name not in inline_tags
            )
            suffix = '\n\n' if closes_block else ' '
            # The suffix guarantees a non-empty fragment, so even a
            # whitespace-only string still contributes its separator.
            yield str(node).strip() + suffix
        elif node.name == 'br':
            yield '\n\n'
        node = node.next_element
def parse_toc1(text):
    """Build a table of contents from the first ``<table>`` of an HTML page.

    Rows with fewer than three ``<td>`` cells are ignored.  For every other
    row the first two cells supply the title and author text, the third
    cell's text is parsed as an integer used only for ordering, and the
    ``href`` of the first ``<a>`` inside that third cell is the link.

    Returns a list of ``(title, author, href)`` tuples sorted by the integer
    taken from the third cell.
    """
    document = BeautifulSoup(text, 'html.parser')
    rows = []
    for row in document.find('table').find_all('tr'):
        cells = row.find_all('td')
        if len(cells) >= 3:
            title_cell, author_cell, number_cell = cells[:3]
            rows.append((
                int(number_cell.text),
                title_cell.text,
                author_cell.text,
                number_cell.find('a').attrs['href'],
            ))
    # list.sort is stable, so rows sharing a number keep document order.
    rows.sort(key=itemgetter(0))
    return [(title, author, href) for _, title, author, href in rows]
def parse_toc2(text, titles):
    """Match the links of an HTML page against known story titles.

    Every ``<a>`` carrying an ``href`` is visited in document order.  Its
    stripped, lower-cased text is compared with each ``(title, author)`` pair
    in *titles*; the first pair whose lower-cased title is a prefix of the
    link text wins.

    Returns a list of ``(title, author, href)`` tuples in link order.  Links
    matching no title are left out, and a title may appear more than once if
    several links match it.
    """
    entries = []
    document = BeautifulSoup(text, 'html.parser')
    for anchor in document.find_all('a', href=True):
        label = anchor.text.strip().lower()
        matched = next(
            (
                (title, author)
                for title, author in titles
                if label.startswith(title.lower())
            ),
            None,
        )
        if matched is not None:
            title, author = matched
            entries.append((title, author, anchor.attrs['href']))
    return entries
def parse_toc3(text, titles):
    """Find, for each known title, the first link of an HTML page naming it.

    The ``(title, author)`` pairs in *titles* are processed in order.  For
    each one, the ``<a>`` elements carrying an ``href`` are scanned in
    document order and the first whose stripped, lower-cased text starts with
    the lower-cased title is taken.

    Returns a list of ``(title, author, href)`` tuples in *titles* order.
    Titles without a matching link are left out.
    """
    entries = []
    document = BeautifulSoup(text, 'html.parser')
    # The document is never modified, so the anchor list can be built once.
    anchors = document.find_all('a', href=True)
    for title, author in titles:
        hit = next(
            (
                anchor
                for anchor in anchors
                if anchor.text.strip().lower().startswith(title.lower())
            ),
            None,
        )
        if hit is not None:
            entries.append((title, author, hit.attrs['href']))
    return entries
def parse_text(text, titles):
    """Split a plain-text anthology into per-story lists of lines.

    *text* is passed through ``gutenberg.strip_headers`` (presumably removing
    the Project Gutenberg boilerplate -- confirm against that library), then
    stripped and split into lines.  Everything from the last line that reads
    ``INDEX`` onwards is discarded; if there is no such line the whole text
    is kept.

    *titles* is an iterable of ``(title, author)`` pairs.  A line starts a new
    story when, with underscores replaced by spaces, it either

    * equals the title after removing ``[1]``, trailing ``*`` characters and
      surrounding whitespace, or
    * starts (after stripping) with the upper-cased title.

    Lines that are exactly an author name are skipped.  All other lines,
    including blank ones, are appended to the story currently being
    collected; the heading line itself is stored as an empty string.  Lines
    before the first heading are dropped.

    Returns a dict mapping ``(title, author)`` to the list of that story's
    lines.  A heading that occurs again restarts its story from scratch.
    """
    lines = gutenberg.strip_headers(text).strip().splitlines()

    # Default to "keep everything": a -1 sentinel here would make the slice
    # below silently drop the final line whenever no INDEX line exists.
    cutoff = len(lines)
    for index, line in enumerate(lines):
        if line.strip() == 'INDEX':
            cutoff = index  # no break: the last INDEX line wins
    lines = lines[:cutoff]

    stories = {}
    key = None
    authors = [author for _, author in titles]
    for line in lines:
        if line:
            if line in authors:
                continue
            # Normalise once per line rather than once per candidate title.
            spaced = line.replace('_', ' ')
            as_title = spaced.replace('[1]', '').rstrip('*').strip()
            as_heading = spaced.strip()
            for title, author in titles:
                if as_title == title or as_heading.startswith(title.upper()):
                    key = (title, author)
                    stories[key] = []
                    # Keep a blank placeholder instead of the heading text.
                    line = ''
                    break
        if key is not None:
            stories[key].append(line)
    return stories