-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_kab_chars.py
137 lines (119 loc) · 5.96 KB
/
check_kab_chars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
"""
Check Kabyle Sentences for Non-Standard Characters and Optionally Fix Them
This tool reads a text file containing Kabyle sentences (one per line) and
checks each sentence for any alphabetical characters that are not in the
allowed standardized set. Numbers, punctuation, and whitespace are ignored.
Allowed characters (case-insensitive):
['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t',
'u','v','w','x','y','z','č','ḍ','ǧ','ḥ','ɣ','ṛ','ṣ','ṭ','ɛ','ẓ']
If the --fix flag is provided, the tool will automatically replace known disallowed characters
according to the following mapping (applied in order of decreasing key length):
- "ţţ" → "tt" (two consecutive ţ's → "tt")
- "țț" → "tt" (two consecutive ț's → "tt")
- ϵ → ɛ (Greek lunate epsilon to Latin small open e)
- ε → ɛ (Greek small epsilon to Latin small open e)
- γ → ɣ (Greek small gamma to Latin small gamma)
- Γ → Ɣ (Greek capital Gamma to Latin capital Ɣ)
- Σ → Ɛ (Greek capital Sigma to Latin capital Ɛ)
- Ԑ → Ɛ (Cyrillic letter Ԑ to Latin capital Ɛ)
- ğ → ǧ (Latin letter ğ to Latin letter ǧ)
- ş → ṣ (Latin letter ş to Latin letter ṣ)
Usage examples:
Check sentences without fixing:
python3 check_kab_chars.py --input_file kab.txt
Check and fix (writes corrected sentences to kab_fixed.txt by default):
python3 check_kab_chars.py --input_file kab.txt --fix
Or specify a custom output file:
python3 check_kab_chars.py --input_file kab.txt --fix --fixed_output my_kab.txt
"""
import argparse
import os
import unicodedata
def find_disallowed(sentence, allowed_set):
    """Return the alphabetic characters of *sentence* that are not allowed.

    The sentence is normalized to NFC before it is inspected. Without that
    step a letter written as a base letter followed by a combining mark
    (for example ``g`` + U+0306) would slip through: the base letter is
    allowed and the combining mark is not alphabetic, so neither is
    reported, even though the precomposed letter is not in the allowed set.

    Args:
        sentence: Text to inspect. Non-alphabetic characters (digits,
            punctuation, whitespace, combining marks) are ignored.
        allowed_set: Collection of allowed letters, in lowercase.

    Returns:
        A set of the offending characters, in the case they appear in after
        NFC normalization. Empty when every letter is allowed.
    """
    composed = unicodedata.normalize('NFC', sentence)
    # Membership is tested on the lowercase form so the allowed set only
    # needs to list lowercase letters; the reported character keeps its case.
    return {
        char
        for char in composed
        if char.isalpha() and char.lower() not in allowed_set
    }
def fix_sentence(sentence, fix_mapping):
    """Return *sentence* in NFC form with every mapped sequence replaced.

    Replacements run longest key first, so a multi-character key is
    substituted before any shorter key that could match part of it. Keys of
    equal length are applied in the mapping's own iteration order.
    """
    result = unicodedata.normalize('NFC', sentence)
    # reverse=True keeps the sort stable, so equal-length keys stay in
    # mapping order.
    longest_first = sorted(
        fix_mapping.items(),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )
    for wrong, right in longest_first:
        result = result.replace(wrong, right)
    return result
def _default_output_path(input_file):
    """Return *input_file* with ``_fixed`` inserted before its extension.

    Only the final path component is searched for an extension, so a dot in
    a directory name (``data.v2/kab``) is not mistaken for one. A file name
    without an extension gets ``.txt``.
    """
    root, ext = os.path.splitext(input_file)
    return f"{root}_fixed{ext or '.txt'}"


def main():
    """Command-line entry point: report disallowed characters, optionally fix.

    The input file is read once to print every non-empty line containing
    alphabetic characters outside the allowed set, followed by a summary.
    With ``--fix`` it is read a second time and each line, passed through
    ``fix_sentence``, is written to the output file.

    Exits through ``parser.error`` when ``--fix`` is given and the output
    path resolves to the input path, because opening that file for writing
    would truncate the input before it is read.
    """
    parser = argparse.ArgumentParser(
        description="Check Kabyle sentences for non-standard characters and optionally fix them."
    )
    parser.add_argument("--input_file", default="kab.txt",
                        help="Input text file with one Kabyle sentence per line (default: kab.txt)")
    parser.add_argument("--fix", action="store_true",
                        help="Automatically fix known disallowed characters and output a corrected file.")
    parser.add_argument("--fixed_output", default=None,
                        help="Output file for fixed sentences (default: input filename with '_fixed' appended)")
    args = parser.parse_args()

    # Resolve and validate the output path before any work is done, so a bad
    # combination of arguments is rejected without touching either file.
    output_file = None
    if args.fix:
        output_file = args.fixed_output or _default_output_path(args.input_file)
        if os.path.realpath(output_file) == os.path.realpath(args.input_file):
            parser.error(
                "the fixed output file must differ from the input file; "
                "writing to the input would erase it before it is read"
            )

    # Allowed characters (lowercase) as provided.
    allowed_chars = [
        'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t',
        'u','v','w','x','y','z','č','ḍ','ǧ','ḥ','ɣ','ṛ','ṣ','ṭ','ɛ','ẓ'
    ]
    allowed_set = set(allowed_chars)

    problematic_sentences = 0
    total_sentences = 0
    with open(args.input_file, "r", encoding="utf-8") as infile:
        # Line numbers count every line of the file, including blank ones,
        # so they match what an editor shows.
        for i, line in enumerate(infile, start=1):
            sentence = line.strip()
            if not sentence:
                continue
            total_sentences += 1
            disallowed = find_disallowed(sentence, allowed_set)
            if disallowed:
                problematic_sentences += 1
                sorted_disallowed = sorted(disallowed)
                print(f"Line {i}: {sentence}")
                print(f" Disallowed characters: {', '.join(sorted_disallowed)}\n")
    print(f"Checked {total_sentences} sentences. Found {problematic_sentences} sentence(s) with disallowed characters.")

    if not args.fix:
        return

    # Mapping for fixes. Multi-character keys come first.
    fix_mapping = {
        'ţţ': 'tt',   # Two consecutive ţ's → "tt"
        'țț': 'tt',   # Two consecutive ț's → "tt"
        'ϵ': 'ɛ',     # Greek lunate epsilon → Latin small open e
        'ε': 'ɛ',     # Greek small epsilon → Latin small open e
        'γ': 'ɣ',     # Greek small gamma → Latin small gamma
        'Γ': 'Ɣ',     # Greek capital Gamma → Latin capital Ɣ
        'Σ': 'Ɛ',     # Greek capital Sigma → Latin capital Ɛ
        'Ԑ': 'Ɛ',     # Cyrillic letter Ԑ → Latin capital Ɛ
        'ğ': 'ǧ',     # Latin letter ğ → Latin letter ǧ
        'ş': 'ṣ'      # Latin letter ş → Latin letter ṣ
    }

    fixed_count = 0
    with open(args.input_file, "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            fixed_line = fix_sentence(line, fix_mapping)
            # A line counts as fixed when anything changed, including a
            # change made only by NFC normalization.
            if fixed_line != line:
                fixed_count += 1
            outfile.write(fixed_line)
    print(f"Fixed {fixed_count} lines. Corrected file saved as '{output_file}'.")


if __name__ == "__main__":
    main()