-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathexample.py
31 lines (21 loc) · 818 Bytes
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from readwiki.wiki_download import WIKIDownload
from readwiki.wiki_parse2doc import WIKIParse2Doc
# =====================================
# STEP 1 : DOWNLOAD WIKI DUMP
# =====================================
# DEPENDENCY: wget
# archive = '20200220'
# output_dir = './dump'
# print('Downloading dump:', archive)
# downloader = WIKIDownload(output_dir)
# xml_path, txt_path = downloader.run(
# archive, verbose=True
# )
# print('Index txt:', txt_path)
# print('Content xml:', xml_path)
# =====================================
# STEP 2 : PARSE XML TO HUMAN-READABLE
# =====================================
# Location of the compressed multistream dump to parse (same archive date,
# 20200220, as the commented-out download step above).
xml_path = './dump/zhwiki-20200220-pages-articles-multistream.xml.bz2'

# Run the parser twice over the same dump, once per output flavour:
# first with default options into ./docs/words_txt, then with markdown=True
# into ./docs/words_md.
for out_dir, parser_options in (
    ('./docs/words_txt', {}),
    ('./docs/words_md', {'markdown': True}),
):
    # NOTE(review): num=100 presumably caps how many entries are processed
    # per run -- confirm against WIKIParse2Doc.run.
    WIKIParse2Doc(xml_path, out_dir, **parser_options).run(num=100)