#!/usr/bin/env python3
"""
Python3 script to fetch pinyin automatically from raw txt files. Results are saved to the "result" subdirectory.
Each line of a raw txt file should be in one of the following three formats:
'''
中国
美国 49.9999
德国 97.0 # this is a comment
'''
After running this script, you get the following result (saved as txt files under "result"):
'''
中国 zhong'guo
美国 mei'guo 49.9999
德国 de'guo 97.0
'''
As you can see, the result files can be imported into ibus-libpinyin directly.
To run this script:
1. Put your raw text files with words into "raw_dir".
2. Revise the "raw_dir" variable below.
3. Run this script in a terminal.
Author: Kevin Suo <suokunlong@126.com>
This script was written specifically for libpinyin, but may be used for any other suitable purpose.
Licensed under the GNU General Public License v3 or later.
"""
import os
import urllib.request
import urllib.parse
from time import sleep
# Please revise this line before running.
raw_dir = r"/run/media/suokunlong/SSD-Data/soft/libpinyin-dict/raw"
def fetch_pinyin(words, sep="'"):
"""Fetch pinyin from https://zhongwenzhuanpinyin.51240.com/
inputs: words = [['中国'],
['美国', '49.9999'],
['德国', '97.0', '# this is a comment']]
output: dicts = [['中国', "zhong'guo", None, None],
['美国', "mei'guo", '49.9999', None],
['德国', "de'guo", '97.0', '# this is a comment']]
"""
print("process words: ", words)
url = "https://zhongwenzhuanpinyin.51240.com/web_system/51240_com_www/system/file/zhongwenzhuanpinyin/data/?ajaxtimestamp=1516872009015"
headers = {
'Accept': '*/*',
'Accept-Encoding': 'utf-8, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Proxy-Connection': 'keep-alive',
'User-Agent': '"Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"'
}
words_str = ""
for word in words:
words_str = words_str + word[0] + ","
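    # Form fields for the 51240.com converter. The field names are pinyin; the
    # meanings below are inferred from those names, not from any documentation:
    # shengdiao = tone marks, wenzi = output script, jiange = separator between
    # syllables, duozhongduyin = multiple (polyphonic) readings.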
post_values = {
'zwzyp_zhongwen': words_str,
'zwzyp_shengdiao': "0",
'zwzyp_wenzi': "0",
'zwzyp_jiange': "1",
'zwzyp_duozhongduyin': "0"}
#print("post values are: ", post_values)
data=urllib.parse.urlencode(post_values).encode('utf-8')
request = urllib.request.Request(url, headers=headers, data=data)
html = urllib.request.urlopen(request).read()
html = html.decode()
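    # Strip the <textarea> wrapper from the returned HTML; what remains is the
    # " , "-separated list of pinyin strings, one per posted word.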
html = html.replace("""<div align="center"><textarea name="zhongwen" style="font-size: 18px;border: 1px solid #d9d9d9;height: 155px;width: 95%;" readonly="readonly">""","")
html = html.replace(""" , </textarea></div>""","")
pinyins = html.split(" , ")
#print(pinyins)
dicts = []
for i in range(len(words)):
l = len(words[i])
chars = words[i][0]
pinyin = pinyins[i].replace(" ", sep)
if l == 1:
freq = None
other = None
elif 1 < l <= 3:
freq = words[i][1]
other = None
if l == 3:
other = words[i][2]
else:
raise Exception("lenth of word list > 3: ", words[i])
dict = [chars, pinyin, freq, other]
dicts.append(dict)
# avoid flood to the server.
sleep(1)
return dicts
def get_words(lines):
"""
input: lines = ["中国",
"美国 49.9999",
"德国 97.0 # this is a comment"]
return: words = [['中国'],
['美国', '49.9999'],
['德国', '97.0', '# this is a comment']]
"""
words = []
for line in lines:
word = line.split(" ", maxsplit=2)
chars = word[0]
if not is_chinese(chars):
raise Exception("get_words_error", "Non-chinese word chars found in line: ", line)
if len(word) >= 2:
freq = word[1]
if not freq.replace(".", "").isdigit():
raise Exception("get_words_error", "non-numeric freq chars found in line: ", line)
words.append(word)
return words
def is_chinese(uchar):
    """Check whether all characters in the string are Chinese characters
    (basic CJK Unified Ideographs, U+4E00..U+9FA5).
    http://blog.csdn.net/qinbaby/article/details/23201883
    """
    return all(u'\u4e00' <= ch <= u'\u9fa5' for ch in uchar)
def check_line(line):
"""
"""
line = line.strip()
if line.startswith("#") or line == "":
pass
else:
word = line.split(" ", maxsplit=2)
chars = word[0]
if not is_chinese(chars):
print("Line error: ", "Non-chinese word chars found in line: ", line)
return False
if len(word) >= 2:
freq = word[1]
if not freq.replace(".", "").isdigit():
print("get_words_error", "non-numeric freq chars found in line: ", line)
return False
return True
def check_raw_file(raw_file):
"""
"""
with open(raw_file,"r") as f_in:
lines = f_in.readlines()
errors = []
    for i, line in enumerate(lines):
        if not check_line(line):
            errors.append([i + 1, line])
    if errors:
        print("Raw file check errors: ", errors, "\n", raw_file)
return False
else:
return True
def check_raw_files(raw_dir):
"""
"""
for file_name in os.listdir(raw_dir):
# print("checking raw file: ", file_name)
raw_file = os.path.join(raw_dir, file_name)
        if os.path.isdir(raw_file):
            continue
        if not check_raw_file(raw_file):
            raise Exception("Raw File Check Failed: ", raw_file)
def process_raw_file(raw_file, result_file):
""" 1. 一行一行读取
2. 碰到comment行或空行,如果lines_current里有要索取的词,则执行索取操作,写入索取结果,清零lines_current, 然后再写入comment或空行。
3. 每max_lines_per_fetch行时,强制索取。
4. 到结尾时,索取剩余的lines_current中的词。
"""
if os.path.isdir(raw_file) or os.path.isdir(result_file):
return False
with open(result_file,"w") as f_out:
with open(raw_file,"r") as f_in:
lines = f_in.readlines()
max_lines_per_fetch = 200
i = 1
lines_current = []
for line in lines:
line = line.strip()
if line.startswith("#") or line == "":
                    if len(lines_current) == 0:  # there are no words to fetch
f_out.write(line+"\n")
else:
words = get_words(lines_current)
dicts = fetch_pinyin(words)
write_dicts(f_out, dicts)
lines_current.clear()
# remember to write the comment or blank line after fetch
f_out.write(line+"\n")
else:
lines_current.append(line)
                if lines_current and (len(lines_current) >= max_lines_per_fetch or i == len(lines)):
words = get_words(lines_current)
dicts = fetch_pinyin(words)
write_dicts(f_out, dicts)
lines_current.clear()
i += 1
def write_dicts(f_out, dicts):
"""
"""
for dict in dicts:
chars = dict[0]
pinyin = dict[1]
freq = dict[2]
# "other" is disregarded in below process.
other = dict[3]
if freq == None:
f_out.write(dict[0]+" "+dict[1]+"\n")
else:
f_out.write(chars+" "+pinyin+" "+freq+"\n")
def process_raw_files(raw_dir):
"""
"""
# check for errors in raw files before do anything else
check_raw_files(raw_dir)
result_dir = os.path.join(raw_dir, "result")
if not os.path.isdir(result_dir):
os.mkdir(result_dir)
    for file_name in os.listdir(raw_dir):
        raw_file = os.path.join(raw_dir, file_name)
        # Skip directories (including the "result" directory itself).
        if os.path.isdir(raw_file):
            continue
        print("Processing raw file: ", file_name)
        result_file = os.path.join(result_dir, file_name)
        process_raw_file(raw_file, result_file)
if __name__ == "__main__":
    process_raw_files(raw_dir)