-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathadd_forvo.py
55 lines (48 loc) · 2.26 KB
/
add_forvo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import glob
import subprocess
from datasets import Dataset, Audio, concatenate_datasets
import pandas as pd
from typing import List
def _read_ipa_lines(text_file: str) -> List[str]:
    """Return the stripped, non-blank lines of an ``ipa.txt`` file."""
    # The file holds IPA symbols, so the platform default encoding must not be
    # relied upon.
    with open(text_file, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # Filter *after* stripping so whitespace-only lines are dropped too.
        return [line for line in stripped if line]


def _load_language(root: str, lang: str) -> "Dataset":
    """Build the dataset for the single language directory ``root/lang``."""
    if not lang.istitle():
        raise AssertionError(
            "The first letter of the language must be upper-case "
            f"(i.e., title case): {lang!r}"
        )

    audio_dir = os.path.join(root, lang, "Audio", "wav")
    text_file = os.path.join(root, lang, "ipa.txt")

    # Hidden entries (e.g. .DS_Store) are not recordings and must not take part
    # in the count check. A set gives O(1) membership tests in the loop below.
    audio_files = {
        name for name in os.listdir(audio_dir) if not name.startswith(".")
    }

    # A line looks like this:
    # pronunciation_xh_amabhaca amabaːǀa
    entries = _read_ipa_lines(text_file)
    if not entries:
        raise AssertionError(f"No IPA entries found in {text_file}")
    if len(entries) != len(audio_files):
        raise AssertionError(
            "Numbers of audio files and IPAs don't match "
            f"({len(audio_files)} audio files vs {len(entries)} IPA lines in {lang})"
        )

    columns = {"path": [], "ipa": [], "sentence": []}
    for entry in entries:
        stem, *pron_parts = entry.split()
        print(stem)
        filename = stem + ".wav"
        if filename not in audio_files:
            raise AssertionError(
                "Audio file not found. Check the file name or the directory. "
                f"(missing: {os.path.join(audio_dir, filename)})"
            )
        columns["path"].append(os.path.join(audio_dir, filename))
        columns["ipa"].append(" ".join(pron_parts))
        # The orthographic form is whatever follows the second underscore of
        # the file name.
        columns["sentence"].append(" ".join(stem.split("_")[2:]))

    text_data = Dataset.from_pandas(pd.DataFrame(columns))
    # Build the audio column from the very same ordered path list. Listing the
    # directory again (glob/listdir) yields an arbitrary order, which would
    # pair recordings with the wrong transcription when joined column-wise.
    audio_data = Dataset.from_dict({"audio": columns["path"]}).cast_column(
        "audio", Audio(sampling_rate=48000)
    )
    # Concatenate the text columns and the audio column side by side.
    return concatenate_datasets([text_data, audio_data], axis=1)


def add_language() -> "Dataset":
    """Build a ``Dataset`` from the extra recordings stored under ``additional/``.

    The directory is looked up at ``./additional/`` and, if that does not
    exist, at ``../additional/``. It is expected to contain one directory per
    language, laid out like this::

        additional/Xhosa/Audio/wav/{AUDIOFILE}.wav
        additional/Xhosa/ipa.txt

    Every non-blank line of ``ipa.txt`` is ``<audio file stem> <IPA>``, e.g.
    ``pronunciation_xh_amabhaca amabaːǀa``.

    Languages are processed in sorted order and their rows are concatenated,
    so the result is deterministic and no language is silently dropped.

    Returns:
        A dataset with the columns ``path``, ``ipa``, ``sentence`` and
        ``audio`` (cast to ``Audio(sampling_rate=48000)``), one row per line of
        each ``ipa.txt``.

    Raises:
        AssertionError: if a language directory name is not title case, if an
            ``ipa.txt`` is empty, if the number of IPA lines differs from the
            number of audio files, or if a listed audio file is missing.
        FileNotFoundError: if ``additional/`` (or an expected file inside it)
            does not exist, or if it contains no language directory.
    """
    root = "./additional/"
    if not os.path.exists(root):
        root = "../additional/"

    langs = sorted(name for name in os.listdir(root) if not name.startswith("."))
    if not langs:
        raise FileNotFoundError(f"No language directories found in {root}")

    per_language = [_load_language(root, lang) for lang in langs]
    if len(per_language) == 1:
        return per_language[0]
    return concatenate_datasets(per_language)