-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_manscript.py
executable file
·160 lines (150 loc) · 5.86 KB
/
convert_manscript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import re
def replace_special_characters(text):
    """Expand the ASCII shorthands for the special Möllendorff letters.

    The two-letter sequences ``zv``, ``sv`` and ``uv`` are rewritten to
    ``ž``, ``š`` and ``ū`` respectively, in that order.  All other text is
    returned unchanged.
    """
    # Applied in the fixed order zv -> sv -> uv.
    return text.replace('zv', 'ž').replace('sv', 'š').replace('uv', 'ū')
def deal_special_Latin(word):
    """Pre-process Möllendorff romanization before conversion to Manchu script.

    Two normalizations are applied:

    1. Whitespace on either side of an apostrophe is removed, so that
       ``k ' a`` becomes ``k'a``.
    2. Every ``ng`` that is NOT followed by one of the vowels
       ``a e i o u ū`` (i.e. ``ng`` before any other character, or at the
       very end of the text) is collapsed into the single placeholder
       letter ``N``.

    The function is meant to be called from ``mulinde2manscript`` rather
    than used on its own.

    Returns the normalized string.
    """
    # Drop the whitespace around apostrophes.
    word = re.sub(r"\s*'\s*", "'", word)
    # A single left-to-right substitution replaces the former index loop.
    # Requiring a literal "n" before the "g" means a leading "g" can never
    # be paired with the *last* character of the text (the old code read
    # word[-1] when i == 0 and corrupted inputs such as "g'an").
    return re.sub(r"ng(?![aeiouū])", "N", word)
def mulinde2manscript(latinword):
    """Convert Möllendorff romanization into traditional Manchu script.

    The input is first normalized by ``deal_special_Latin`` and then
    scanned left to right.  At each position the longest key of the
    lookup table (3, then 2, then 1 characters) is translated; characters
    with no table entry are copied through unchanged.  Finally a space is
    inserted in front of every ``᠈`` or ``᠉`` that is not already
    preceded by whitespace.

    Returns the converted string.
    """
    latin_to_manchu = {
        "a":"ᠠ", "e":"ᡝ", "i":"ᡳ",
        "o":"ᠣ", "u":"ᡠ", "ū":"ᡡ", "n":"ᠨ",
        # "N" is the placeholder produced by deal_special_Latin.
        "N":"ᠩ",
        "k":"ᡴ", "g":"ᡤ", "h":"ᡥ",
        "b":"ᠪ", "p":"ᡦ", "s":"ᠰ", "š":"ᡧ",
        "t":"ᡨ", "d":"ᡩ", "l":"ᠯ", "m":"ᠮ",
        "c":"ᠴ", "j":"ᠵ", "y":"ᠶ", "r":"ᡵ",
        "f":"ᡶ", "w":"ᠸ", "k'":"ᠺ", "g'":"ᡬ",
        "h'":"ᡭ", "ts'":"ᡮ", "ts":"ᡮᡟ", "dz":"ᡯ",
        "dzi":"ᡯᡳ", "ž":"ᡰ", "sy":"ᠰᡟ", "c'y":"ᡱᡳ","jy":"ᡷᡳ",
        ",":"᠈", "<":"︽", ".":"᠉", ">":"︾",
        "?":"︖", "!":"︕", ";":"︔", ":":"᠄", "[":"﹇", "]":"﹈",
        # NOTE(review): the value for "-" is a space-like character,
        # presumably a narrow no-break space -- confirm before editing.
        "{":"︿", "}":"﹀", "\\":"᠁", "|":"︱", "-":" ",
    }

    prepared = deal_special_Latin(latinword)
    total = len(prepared)
    pieces = []
    pos = 0
    while pos < total:
        # Longest candidate first so that multi-character keys such as
        # "ts'" or "dzi" win over their shorter prefixes.
        for width in (3, 2, 1):
            glyph = latin_to_manchu.get(prepared[pos:pos + width])
            if glyph is not None:
                pieces.append(glyph)
                pos += width
                break
        else:
            # No table entry: keep the character as it is.
            pieces.append(prepared[pos])
            pos += 1

    # Put a space before the Manchu comma / full stop when none is there.
    return re.sub(r"(?<!\s)([᠈᠉])", r" \1", "".join(pieces))
def manscript2mulinde(manjuword):
    """Convert traditional Manchu script into Möllendorff romanization.

    The text is scanned left to right; at each position the longest
    Manchu sequence known to the reverse lookup table is replaced by its
    Latin spelling.  Characters without a table entry (including any
    Latin letters already present in the input) are copied through
    unchanged.

    Returns the romanized string.
    """
    latin_to_manchu = {
        "a":"ᠠ", "e":"ᡝ", "i":"ᡳ",
        "o":"ᠣ", "u":"ᡠ", "ū":"ᡡ", "n":"ᠨ",
        "N":"ᠩ",
        "k":"ᡴ", "g":"ᡤ", "h":"ᡥ",
        "b":"ᠪ", "p":"ᡦ", "s":"ᠰ", "š":"ᡧ",
        "l":"ᠯ", "m":"ᠮ",
        "c":"ᠴ", "j":"ᠵ", "y":"ᠶ", "r":"ᡵ",
        "f":"ᡶ", "w":"ᠸ", "k'":"ᠺ", "g'":"ᡬ",
        "h'":"ᡭ", "ts'":"ᡮ", "ts":"ᡮᡟ", "dz":"ᡯ",
        "dzi":"ᡯᡳ", "ž":"ᡰ", "sy":"ᠰᡟ", "c'y":"ᡱᡳ","jy":"ᡷᡳ",
        ",":"᠈", "<":"︽", ".":"᠉", ">":"︾",
        "?":"︖", "!":"︕", ";":"︔", ":":"᠄", "[":"﹇", "]":"﹈",
        # NOTE(review): the value for "-" is a space-like character,
        # presumably a narrow no-break space -- confirm before editing.
        "{":"︿", "}":"﹀", "\\":"᠁", "|":"︱", "-":" ",
    }

    # Reverse table (Manchu -> Latin).  The placeholder "N" is resolved to
    # "ng" right here instead of by a final str.replace on the output, so
    # a literal "N" that was merely passed through from the input is no
    # longer rewritten to "ng".
    manchu_to_latin = {
        manchu: ("ng" if latin == "N" else latin)
        for latin, manchu in latin_to_manchu.items()
    }
    # t and d are accepted both bare and followed by an invisible
    # variation-selector character (the first key of each pair below is
    # two code points long).
    manchu_to_latin["ᡨ᠋"] = "t"
    manchu_to_latin["ᡨ"] = "t"
    manchu_to_latin["ᡩ᠋"] = "d"
    manchu_to_latin["ᡩ"] = "d"

    # Longest key in the table (currently 2 characters).
    longest = max(map(len, manchu_to_latin))

    pieces = []
    pos = 0
    total = len(manjuword)
    while pos < total:
        # Try the longest Manchu sequence first.
        for width in range(longest, 0, -1):
            chunk = manjuword[pos:pos + width]
            if chunk in manchu_to_latin:
                pieces.append(manchu_to_latin[chunk])
                pos += len(chunk)
                break
        else:
            # Unknown character: keep it unchanged.
            pieces.append(manjuword[pos])
            pos += 1
    return "".join(pieces)
def detect_script(input_text):
    """Guess whether the input is Manchu script or Latin romanization.

    Only the first five characters are inspected.  If any of them is one
    of the 26 English letters (upper or lower case) the input is treated
    as romanization and ``"mulinde"`` is returned; otherwise
    ``"manscript"`` is returned (this includes the empty string).
    """
    def is_english_letter(ch):
        # Plain code-point range checks: exactly a-z and A-Z.
        return "a" <= ch <= "z" or "A" <= ch <= "Z"

    has_latin = any(is_english_letter(ch) for ch in input_text[:5])
    return "mulinde" if has_latin else "manscript"
def convert_manscript(text):
    """Convert between Manchu script and Möllendorff romanization.

    The ASCII shorthands are expanded with ``replace_special_characters``
    first, then ``detect_script`` decides the direction: romanization is
    converted to Manchu script, and Manchu script is converted to
    romanization.

    Raises:
        ValueError: if ``detect_script`` reports anything other than
            ``"manscript"`` or ``"mulinde"``.
    """
    normalized = replace_special_characters(text)
    kind = detect_script(normalized)
    if kind == "mulinde":
        return mulinde2manscript(normalized)
    if kind == "manscript":
        return manscript2mulinde(normalized)
    raise ValueError("Unknown script type.")
def convert_manscript2(text):
    """Return the Möllendorff romanization of the input.

    The ASCII shorthands are expanded with ``replace_special_characters``
    first.  Input detected as Manchu script is converted to romanization;
    input detected as romanization is returned as is (after the shorthand
    expansion).

    Raises:
        ValueError: if ``detect_script`` reports anything other than
            ``"manscript"`` or ``"mulinde"``.
    """
    normalized = replace_special_characters(text)
    kind = detect_script(normalized)
    if kind == "mulinde":
        # Already romanized: nothing further to convert.
        return normalized
    if kind == "manscript":
        return manscript2mulinde(normalized)
    raise ValueError("Unknown script type.")