From 9c85534199126b0ab8ccd0c416a09ee314b4f17f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Kyj=C3=A1nek?= Date: Sat, 7 Sep 2019 12:45:23 +0200 Subject: [PATCH] Create scripts for phonetic transcription of Czech, Slovak and Polish. --- CHANGELOG.txt | 4 + README.md | 70 +++++++++++++++ phon_czech.py | 237 +++++++++++++++++++++++++++++++++++++++++++++++++ phon_polish.py | 236 ++++++++++++++++++++++++++++++++++++++++++++++++ phon_slovak.py | 226 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 773 insertions(+) create mode 100644 CHANGELOG.txt create mode 100644 README.md create mode 100644 phon_czech.py create mode 100644 phon_polish.py create mode 100644 phon_slovak.py diff --git a/CHANGELOG.txt b/CHANGELOG.txt new file mode 100644 index 0000000..9cd521d --- /dev/null +++ b/CHANGELOG.txt @@ -0,0 +1,4 @@ +Version 1 (v1) [released on 7 Sept 2019] +- create script for automatic phonetic transcription of Czech according to listed linguistic works (in README.md) +- create script for automatic phonetic transcription of Slovak according to listed linguistic works (in README.md) +- create script for automatic phonetic transcription of Polish according to listed linguistic works (in README.md) diff --git a/README.md b/README.md new file mode 100644 index 0000000..63df3c2 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +# Automatic phonetic transcription of the Czech, Slovak and Polish languages +This repository contains code for a rule-based approach to the phonetic transcription of the Czech, Slovak and Polish languages into the [International Phonetic Alphabet](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet) (IPA). The rules and IPA symbols used are based on the phonological, phonetic, and orthoepic studies (listed below) of the mentioned West-Slavic languages. + +`CHANGELOG.txt` contains a list of changes in each version. The current version is this one (version 1). 
+ +## Usage +These scripts can be used both as modules imported into any project and as shell scripts. Below are three examples of how to use them. + +**1. Import the function into your Python3 project.** +```python +from phon_czech import ipa_czech +from phon_slovak import ipa_slovak +from phon_polish import ipa_polish + +word1 = ipa_czech('všichni') +text1 = ipa_czech('Všichni lidé rodí se svobodní a sobě rovní co do důstojnosti a práv.') + +word2 = ipa_slovak('všetci') +text2 = ipa_slovak('Všetci ľudia sa rodia slobodní a rovní si do dôstojnosti a práv.') + +word3 = ipa_polish('wszyscy') +text3 = ipa_polish('Wszyscy ludzie rodzą się wolni i równi pod względem godności i praw.') + +print(word1, word2, word3, sep='\n') +print(text1, text2, text3, sep='\n') +``` + +**2. Read from stdin in a shell pipeline.** +```bash +echo -e 'všichni' | python3 phon_czech.py +echo -e 'Všichni lidé rodí se svobodní a sobě rovní co do důstojnosti a práv.' | python3 phon_czech.py + +echo -e 'všetci' | python3 phon_slovak.py +echo -e 'Všetci ľudia sa rodia slobodní a rovní si do dôstojnosti a práv.' | python3 phon_slovak.py + +echo -e 'wszyscy' | python3 phon_polish.py +echo -e 'Wszyscy ludzie rodzą się wolni i równi pod względem godności i praw.' | python3 phon_polish.py +``` + +```bash +cat 'path-to-input-file' | python3 phon_czech.py +cat 'path-to-input-file' | python3 phon_slovak.py +cat 'path-to-input-file' | python3 phon_polish.py +``` + +**3. Read from a file given as the first argument.** +```bash +python3 phon_czech.py 'path-to-input-file' +python3 phon_slovak.py 'path-to-input-file' +python3 phon_polish.py 'path-to-input-file' +``` + +## Based on these studies +- BALOWSKI, Mieczysław. 1993. Fonetika a fonologie současné polštiny. Praha: Karolinum. ISBN: 80-7066-793-1. +- DUDÁŠOVÁ-KRIŠŠÁKOVÁ, Júlia. 1999. Fonologický systém spisovnej slovenčiny a poľštiny z typologického hľadiska. Slavica Slovaca. 34(1), 16-24. ISSN: 0037-6787. +- KAJANOVÁ-SCHULZOVÁ, Oľga. 1970. 
Úvod do fonetiky slovenčiny. Bratislava: Slovenské pedagogické nakladateľstvo. +- KRÁĽ, Ábeľ; SABOL, Ján. 1989. Fonetika a fonológia. Bratislava: Slovenské pedagogické nakladateľstvo. ISBN: 80-08-00036-8. +- KRČMOVÁ, Marie. 2016. Úvod do fonetiky a fonologie pro bohemisty. Ostrava: Universitas Ostraviensis. ISBN: 978-80-7368-636-9. +- KRČMOVÁ, Marie. 2017. TRANSKRIPCE. In: Petr Karlík, Marek Nekula, Jana Pleskalová (eds.), CzechEncy - Nový encyklopedický slovník češtiny. +URL: https://www.czechency.org/slovnik/TRANSKRIPCE. +- KRČMOVÁ, Marie. 2017. ORTOEPIE. In: Petr Karlík, Marek Nekula, Jana Pleskalová (eds.), CzechEncy - Nový encyklopedický slovník češtiny. +URL: https://www.czechency.org/slovnik/ORTOEPIE. +- LIPOWSKI, Jaroslav. 2011. Operatívna fonetika slovenčiny, češtiny a poľštiny. Wrocław: Wydawnictwo Uniwersytetu Wrocławskiego. ISBN: 978-80-7294-511-5. +- LOTKO, Edvard. 1999. Ke konfrontaci příbuzných jazyků. In: Srovnávací a bohemistické studie. Olomouc: Vydavatelství Univerzity Palackého, 9-19. ISBN: 978-80-244-2201-5. +- PALKOVÁ, Zdena. 1994. Fonetika a fonologie češtiny. Praha: Karolinum. ISBN: 80-7066-843-1. +- PAULINY, Eugen. 1979. Slovenská fonológia. Bratislava: Slovenské pedagogické nakladateľstvo. +- ZEMAN, Jiří. 2008. Základy české ortoepie. Hradec Králové: Gaudeamus. ISBN: 978-80-7041-778-2. + +- Fonetická transkripce češtiny. Fonetický ústav, Filozofická fakulta, Univerzita Karlova. URL: https://fonetika.ff.cuni.cz/o-fonetice/foneticka-transkripce/o-foneticke-transkripci/. +- International Phonetic Alphabet. URL: https://www.internationalphoneticassociation.org/redirected_home. 
#!/usr/bin/env python3
# coding: utf-8

"""Phonetic transcription of Czech text to IPA (module ``phon_czech``)."""

import re
import sys


# --- transcription tables: orthographic token -> IPA ------------------------
_VOWELS = {'a': 'a', 'e': 'ɛ', 'i': 'ɪ', 'y': 'ɪ', 'o': 'ɔ', 'u': 'u',
           'á': 'aː', 'é': 'ɛː', 'í': 'iː', 'ý': 'iː', 'ó': 'ɔː',
           'ú': 'uː', 'ů': 'uː', 'ě': 'ɛ'}

_SONORS = {'l': 'l', 'm': 'm', 'n': 'n', 'ň': 'ɲ', 'r': 'r', 'j': 'j'}

# Voiced obstruents: realization in a voiced environment ...
_VOICED = {'dz': 'd͡z', 'dž': 'd͡ʒ', 'v': 'v', 'g': 'ɡ', 'b': 'b',
           'z': 'z', 'ž': 'ʒ', 'd': 'd', 'ď': 'ɟ', 'h': 'ɦ',
           'x': 'ks', 'w': 'v', 'ř': 'r̝', 'q': 'kv'}

# ... and after devoicing (clause end or voiceless cluster).
_VOICED_DEVOICED = {'dz': 't͡s', 'dž': 't͡ʃ', 'v': 'f', 'g': 'k', 'b': 'p',
                    'z': 's', 'ž': 'ʃ', 'd': 't', 'ď': 'c', 'h': 'x',
                    'x': 'ks', 'w': 'f', 'ř': 'r̝̊', 'q': 'kf'}

# Voiceless obstruents.  'ch' belongs here: it is the voiceless fricative
# [x] and only becomes [ɣ] through assimilation to a following voiced
# obstruent.  (It used to sit in the voiced tables, which produced 'ɣ'
# before vowels, e.g. in "chata".)
_VOICELESS = {'c': 't͡s', 'č': 't͡ʃ', 'f': 'f', 'k': 'k',
              'p': 'p', 's': 's', 'š': 'ʃ', 't': 't', 'ť': 'c',
              'ch': 'x'}

_VOICELESS_VOICED = {'c': 'd͡z', 'č': 'd͡ʒ', 'f': 'v', 'k': 'ɡ', 'p': 'b',
                     's': 'z', 'š': 'ʒ', 't': 'd', 'ť': 'ɟ',
                     'ch': 'ɣ'}

# Prefixes ending in a vowel: a following 'u' starts a new syllable
# (glottal stop) instead of forming a diphthong with the prefix vowel.
_VOWEL_PREFIXES = ('nade', 'obe', 'pode', 'přede', 'roze', 'se', 've',
                   'vze', 'ze', 'ne', 'vele', 'ante', 'de', 'pre', 're',
                   'vice', 'na', 'za', 'leda', 'pa', 'pra', 'sotva', 'ana',
                   'dia', 'extra', 'hepta', 'hexa', 'infra', 'intra',
                   'kontra', 'meta', 'para', 'supra', 'tetra', 'ultra',
                   'mimo', 'místo', 'okolo', 'polo', 'skoro', 'alo',
                   'hetero', 'homo', 'hypo', 'iso', 'kvadro', 'makro',
                   'mezzo', 'mikro', 'proto', 'pseudo', 'retro', 'mono')

# TODO: foreign words

# Letters that palatalize a preceding d/t (and, partly, n).
_SOFTENING = ('i', 'í', 'ě')

# Digraphs are single tokens; every other character is its own token.
_TOKEN = re.compile(r'ch|dz|dž|.', re.DOTALL)

# Clause delimiters.  The capturing group makes ``split`` return the
# delimiters too, so clauses and delimiters can never get out of step.
_CLAUSE_DELIMITER = re.compile(r'([,;.!?"\-–])')
_STRONG_DELIMITERS = ('.', '?', '!', ';', '"')


def _ends_with_vowel_prefix(tokens, i):
    """Return True if the tokens ending at index *i* spell a vowel prefix."""
    for prefix in _VOWEL_PREFIXES:
        start = i + 1 - len(prefix)
        if start >= 0 and ''.join(tokens[start:i + 1]) == prefix:
            return True
    return False


def _transcribe_clause(clause):
    """Transcribe one clause (no delimiters inside) to space-joined IPA."""
    part = _TOKEN.findall(clause.lower().strip())
    size = len(part)
    ipa = list(part)  # tokens without a rule (spaces, digits) stay as-is

    # intervals for neutralization and assimilation
    posit_vowel = [-1] + [i for i, tok in enumerate(part) if tok in _VOWELS]
    posit_sonor = [i for i, tok in enumerate(part) if tok in _SONORS]

    # j = last vowel or sonorant of the clause, whichever comes later
    j = posit_vowel[-1]
    if posit_sonor and posit_sonor[-1] > j:
        j = posit_sonor[-1]

    # neutralization: obstruents after that point are clause-final
    for i in range(size - 1, j, -1):
        tok = part[i]
        if tok in _VOICED_DEVOICED:
            ipa[i] = _VOICED_DEVOICED[tok]
        elif tok in _VOICELESS:
            ipa[i] = _VOICELESS[tok]

    # transcription and regressive assimilation, right to left, one
    # vowel-to-vowel interval at a time
    while posit_vowel:
        start = j
        j = posit_vowel.pop()
        voice = None  # voicing of the cluster: None/True/False
        for i in range(start, j, -1):
            tok = part[i]
            prev = part[i - 1] if i > 0 else None
            nxt = part[i + 1] if i + 1 < size else None

            # vowels
            if tok in _VOWELS:
                # ou, eu, au
                if tok in ('a', 'e', 'o') and nxt == 'u':
                    if _ends_with_vowel_prefix(part, i):
                        ipa[i] = _VOWELS[tok] + ' ʔ'
                    else:
                        ipa[i] = _VOWELS[tok] + 'u̯'
                        ipa[i + 1] = ''
                # i/í preceding (prev is None at i == 0, so the check can
                # no longer wrap around to the last token of the clause)
                elif prev in ('i', 'í'):
                    ipa[i] = 'j ' + _VOWELS[tok]
                else:
                    ipa[i] = _VOWELS[tok]
                # glottal stop: clause-initial, or word-initial after a
                # word that ends in a vowel
                if i == 0 or (prev == ' ' and part[i - 2] in _VOWELS):
                    ipa[i] = 'ʔ ' + ipa[i]

            # sonorants (including the one that closes the clause, which
            # used to be skipped and left as raw orthography)
            elif tok in _SONORS:
                voice = None
                if tok == 'n' and nxt == 'n':
                    ipa[i] = ''
                elif tok == 'n' and nxt in ('k', 'g'):
                    ipa[i] = 'ŋ'
                elif tok == 'm' and nxt in ('v', 'f'):
                    ipa[i] = 'ɱ'
                elif tok == 'n' and nxt in ('i', 'í'):
                    ipa[i] = 'ɲ'
                # mně, mě, ně
                elif tok in ('m', 'n') and nxt == 'ě':
                    ipa[i] = 'ɲ' if tok == 'n' else 'm ɲ'
                else:
                    ipa[i] = _SONORS[tok]

            # kk
            elif tok == 'k' and nxt == 'k':
                ipa[i] = ''

            # rightmost obstruent of a cluster decides its voicing
            elif voice is None:
                if tok in _VOICED:
                    # 'v' is voiced itself but does not voice the cluster
                    voice = None if tok == 'v' else True
                    # bě, vě
                    if tok in ('b', 'v') and nxt == 'ě':
                        ipa[i] = _VOICED[tok] + ' j'
                    # di, dí, dě
                    elif tok == 'd' and nxt in _SOFTENING:
                        ipa[i] = 'ɟ'
                    # ř is devoiced after a voiceless consonant
                    elif tok == 'ř' and i != 0:
                        if prev in _VOICELESS:
                            ipa[i] = _VOICED_DEVOICED[tok]
                            voice = False
                        else:
                            ipa[i] = _VOICED[tok]
                    else:
                        ipa[i] = _VOICED[tok]
                elif tok in _VOICELESS:
                    voice = False
                    # pě, fě
                    if tok in ('p', 'f') and nxt == 'ě':
                        ipa[i] = _VOICELESS[tok] + ' j'
                    # ti, tí, tě
                    elif tok == 't' and nxt in _SOFTENING:
                        ipa[i] = 'c'
                    else:
                        ipa[i] = _VOICELESS[tok]

            # assimilation to the voicing chosen above
            elif voice is True and tok in _VOICED:
                ipa[i] = _VOICED[tok]
            elif voice is True and tok in _VOICELESS_VOICED:
                ipa[i] = _VOICELESS_VOICED[tok]
            elif voice is False and tok in _VOICED_DEVOICED:
                ipa[i] = _VOICED_DEVOICED[tok]
            elif voice is False and tok in _VOICELESS:
                ipa[i] = _VOICELESS[tok]

    # drop cells emptied by diphthongs and geminates
    return ' '.join(tok for tok in ipa if tok)


# function for the phonetic transcription of Czech language to IPA
def ipa_czech(text):
    """Return the IPA transcription of the given Czech text or word.

    The text is split into clauses on ``, ; . ! ? " - –``.  Each clause is
    lower-cased and transcribed; IPA symbols are separated by single
    spaces.  In the result ``. ? ! ; "`` become ``' || '`` and ``, - –``
    become ``' | '``.  Characters without a rule are passed through.
    """
    text = text.replace('...', '.')
    # even indices are clauses, odd indices are the delimiters between them
    pieces = _CLAUSE_DELIMITER.split(text)
    output = []
    for index, piece in enumerate(pieces):
        if index % 2:
            output.append(' || ' if piece in _STRONG_DELIMITERS else ' | ')
        elif piece:
            output.append(_transcribe_clause(piece))
    return ''.join(output)


def _main():
    """Transcribe the file named by the only argument, else piped stdin."""
    # The explicit file argument is checked first so that it also works
    # when stdin is not a terminal (cron, subprocess, IDE runners).
    if len(sys.argv) == 2:
        with open(sys.argv[1], mode='r', encoding='utf-8') as f:
            for line in f:
                print(ipa_czech(line.strip()))
    elif not sys.stdin.isatty():
        for line in sys.stdin:
            print(ipa_czech(line.strip()))
    else:
        print('Error: Use script in pipeline or give the path '
              'to the relevant file in the first argument.')


# running script if it is used in shell (with stdin or path to file)
if __name__ == '__main__':
    _main()
#!/usr/bin/env python3
# coding: utf-8

"""Phonetic transcription of Polish text to IPA (module ``phon_polish``)."""

import re
import sys


# --- transcription tables: orthographic token -> IPA ------------------------
_VOWELS = {'a': 'a', 'ą': 'ɔ', 'e': 'ɛ', 'ę': 'ɛ', 'i': 'i', 'o': 'ɔ',
           'u': 'u', 'ó': 'u', 'y': 'ɨ'}

_SONORS = {'j': 'j', 'm': 'm', 'n': 'n', 'ń': 'ɲ', 'ni': 'ɲ', 'r': 'r',
           'l': 'l', 'ł': 'w'}

# Voiced obstruents: realization in a voiced environment ...
_VOICED = {'b': 'b', 'd': 'd', 'g': 'ɡ', 'w': 'v', 'z': 'z',
           'dź': 'd͡ʑ', 'dzi': 'd͡ʑ', 'ź': 'ʑ', 'zi': 'ʑ', 'dz': 'd͡z',
           'dż': 'd͡ʐ', 'ż': 'ʐ', 'rz': 'ʐ',
           'v': 'v', 'q': 'kv', 'x': 'ks'}

# ... and after devoicing.  Devoiced 'dż' is 't͡ʂ' (the same sound as
# 'cz'); it used to map to the still-voiced 'd͡ʐ'.
_VOICED_DEVOICED = {'b': 'p', 'd': 't', 'g': 'k', 'w': 'f', 'z': 's',
                    'dź': 't͡ɕ', 'dzi': 't͡ɕ', 'ź': 'ɕ', 'zi': 'ɕ',
                    'dz': 't͡s', 'dż': 't͡ʂ', 'ż': 'ʂ', 'rz': 'ʂ',
                    'v': 'f', 'q': 'kf', 'x': 'ks'}

# Voiceless obstruents.  'h' and 'ch' belong here: both spell the
# voiceless fricative [x]; [ɣ] arises only through assimilation.  (They
# used to sit in the voiced tables, which produced 'ɣ' before vowels.)
_VOICELESS = {'p': 'p', 't': 't', 'k': 'k', 'f': 'f', 's': 's',
              'ć': 't͡ɕ', 'ci': 't͡ɕ', 'ś': 'ɕ', 'si': 'ɕ',
              'c': 't͡s', 'cz': 't͡ʂ', 'sz': 'ʂ',
              'h': 'x', 'ch': 'x'}

_VOICELESS_VOICED = {'p': 'b', 't': 'd', 'k': 'ɡ', 'f': 'v', 's': 'z',
                     'ć': 'd͡ʑ', 'ci': 'd͡ʑ', 'ś': 'ʑ', 'si': 'ʑ',
                     'c': 'd͡z', 'cz': 'd͡ʐ', 'sz': 'ʐ',
                     'h': 'ɣ', 'ch': 'ɣ'}

# Consonant classes that select the consonantal part of ą / ę.
_N_NASALS = ('d', 'g', 'dz', 'c', 'k', 't', 'cz', 'dż')
_W_NASALS = ('w', 'z', 'ź', 'zi', 'rz', 'ż', 'ś', 'si', 'ch',
             'h', 'f', 's', 'sz')
_M_NASALS = ('b', 'p')
_NI_NASALS = ('dź', 'dzi', 'ci', 'ć')

# Soft digraphs spelled with 'i'.  Before a vowel the 'i' only marks
# softness; otherwise it is also pronounced as the vowel [i].
_CONSONANTS_SOFT = ('dzi', 'zi', 'ci', 'si', 'ni')

# TODO: foreign words

# Multigraphs are single tokens ('dzi' must precede 'dz' and 'zi');
# every other character is its own token.
_TOKEN = re.compile(r'dzi|dź|rz|dż|ch|sz|cz|dz|zi|ci|si|ni|.', re.DOTALL)

# Clause delimiters.  The capturing group makes ``split`` return the
# delimiters too, so clauses and delimiters can never get out of step.
_CLAUSE_DELIMITER = re.compile(r'([,;.!?"\-–])')
_STRONG_DELIMITERS = ('.', '?', '!', ';', '"')


def _nasal_vowel(tok, nxt):
    """Return the IPA for 'ą'/'ę' given the following token (or None)."""
    base = _VOWELS[tok]
    if nxt is None:
        # clause-final: only 'ą' keeps a glide
        return base + ' u̯' if tok == 'ą' else base
    if nxt in _N_NASALS:
        return base + ' ŋ'
    if nxt in _M_NASALS:
        return base + ' m'
    if nxt in _W_NASALS:
        return base + ' u̯'
    if nxt in _NI_NASALS:
        return base + ' ɲ'
    return base


def _transcribe_clause(clause):
    """Transcribe one clause (no delimiters inside) to space-joined IPA."""
    part = _TOKEN.findall(clause.lower().strip())
    size = len(part)
    ipa = list(part)  # tokens without a rule (spaces, digits) stay as-is

    # intervals for neutralization and assimilation
    posit_vowel = [-1] + [i for i, tok in enumerate(part) if tok in _VOWELS]
    posit_sonor = [i for i, tok in enumerate(part) if tok in _SONORS]

    # j = last vowel or sonorant of the clause, whichever comes later
    j = posit_vowel[-1]
    if posit_sonor and posit_sonor[-1] > j:
        j = posit_sonor[-1]
    # A soft digraph after the last vowel letter carries its own [i]
    # ("nosi", "chodzić"), so it is not part of the clause-final cluster.
    for i in range(size - 1, j, -1):
        if part[i] in _CONSONANTS_SOFT:
            j = i
            break

    # neutralization: obstruents after that point are clause-final
    for i in range(size - 1, j, -1):
        tok = part[i]
        if tok in _VOICED_DEVOICED:
            ipa[i] = _VOICED_DEVOICED[tok]
        elif tok in _VOICELESS:
            ipa[i] = _VOICELESS[tok]

    # transcription and assimilation, right to left, one vowel-to-vowel
    # interval at a time
    while posit_vowel:
        start = j
        j = posit_vowel.pop()
        voice = None  # voicing of the cluster: None/True/False
        for i in range(start, j, -1):
            tok = part[i]
            prev = part[i - 1] if i > 0 else None
            nxt = part[i + 1] if i + 1 < size else None

            # soft consonants
            if tok in _CONSONANTS_SOFT:
                if tok in ('dzi', 'zi'):
                    voice = True
                    base = _VOICED[tok]
                elif tok in ('ci', 'si'):
                    voice = False
                    base = _VOICELESS[tok]
                else:  # ni
                    voice = None
                    base = _SONORS[tok]
                ipa[i] = base if nxt in _VOWELS else base + ' i'

            # vowels
            elif tok in _VOWELS:
                # initial of word (glottal plosive)
                if i == 0 or prev == ' ':
                    ipa[i] = 'ʔ ' + _VOWELS[tok]
                # ii
                elif tok == 'i' and nxt == 'i':
                    ipa[i] = 'j'
                # ą, ę  (end of clause is detected by value, not by ``is``)
                elif tok in ('ą', 'ę'):
                    ipa[i] = _nasal_vowel(tok, nxt)
                else:
                    ipa[i] = _VOWELS[tok]

            # sonorants (including the one that closes the clause, which
            # used to be skipped and left as raw orthography)
            elif tok in _SONORS:
                voice = None
                ipa[i] = _SONORS[tok]

            # rightmost obstruent of a cluster decides its voicing
            elif voice is None:
                # regression or progression of w
                if tok == 'w' and i > 0:
                    if prev in ('t', 'k', 's'):
                        ipa[i] = _VOICED_DEVOICED[tok]
                        voice = False
                    else:
                        ipa[i] = _VOICED[tok]
                        voice = True
                # regression or progression of ż and rz
                # NOTE(review): devoicing after the vowels 'a'/'e' is
                # kept as found; progressive devoicing normally follows a
                # voiceless consonant -- confirm the intended set.
                elif tok in ('ż', 'rz') and i > 0:
                    if prev in ('a', 'e'):
                        ipa[i] = _VOICED_DEVOICED[tok]
                        voice = False
                    else:
                        ipa[i] = _VOICED[tok]
                        voice = True
                elif tok in _VOICED:
                    ipa[i] = _VOICED[tok]
                    voice = True
                elif tok in _VOICELESS:
                    ipa[i] = _VOICELESS[tok]
                    voice = False

            # assimilation to the voicing chosen above
            elif voice is True and tok in _VOICED:
                ipa[i] = _VOICED[tok]
            elif voice is True and tok in _VOICELESS_VOICED:
                ipa[i] = _VOICELESS_VOICED[tok]
            elif voice is False and tok in _VOICED_DEVOICED:
                ipa[i] = _VOICED_DEVOICED[tok]
            elif voice is False and tok in _VOICELESS:
                ipa[i] = _VOICELESS[tok]

    return ' '.join(tok for tok in ipa if tok)


# function for the phonetic transcription of Polish language to IPA
def ipa_polish(text):
    """Return the IPA transcription of the given Polish text or word.

    The text is split into clauses on ``, ; . ! ? " - –``.  Each clause is
    lower-cased and transcribed; IPA symbols are separated by single
    spaces.  In the result ``. ? ! ; "`` become ``' || '`` and ``, - –``
    become ``' | '``.  Characters without a rule are passed through.
    """
    text = text.replace('...', '.')
    # even indices are clauses, odd indices are the delimiters between them
    pieces = _CLAUSE_DELIMITER.split(text)
    output = []
    for index, piece in enumerate(pieces):
        if index % 2:
            output.append(' || ' if piece in _STRONG_DELIMITERS else ' | ')
        elif piece:
            output.append(_transcribe_clause(piece))
    return ''.join(output)


def _main():
    """Transcribe the file named by the only argument, else piped stdin."""
    # The explicit file argument is checked first so that it also works
    # when stdin is not a terminal (cron, subprocess, IDE runners).
    if len(sys.argv) == 2:
        with open(sys.argv[1], mode='r', encoding='utf-8') as f:
            for line in f:
                print(ipa_polish(line.strip()))
    elif not sys.stdin.isatty():
        for line in sys.stdin:
            print(ipa_polish(line.strip()))
    else:
        print('Error: Use script in pipeline or give the path '
              'to the relevant file in the first argument.')


# running script if it is used in shell (with stdin or path to file)
if __name__ == '__main__':
    _main()
#!/usr/bin/env python3
# coding: utf-8

"""Phonetic transcription of Slovak text to IPA (module ``phon_slovak``)."""

import re
import sys


# --- transcription tables: orthographic token -> IPA ------------------------
_VOWELS = {'a': 'a', 'e': 'ɛ', 'i': 'ɪ', 'y': 'ɪ', 'o': 'ɔ', 'u': 'u',
           'á': 'aː', 'é': 'ɛː', 'í': 'iː', 'ý': 'iː', 'ó': 'ɔː',
           'ú': 'uː', 'ô': 'u̯ɔ', 'ä': 'æ'}

_SONORS = {'l': 'l', 'm': 'm', 'n': 'n', 'ň': 'ɲ', 'r': 'r', 'j': 'j',
           'ŕ': 'rː', 'ĺ': 'ɭː', 'ľ': 'ʎ'}

# Voiced obstruents: realization in a voiced environment ...
_VOICED = {'dz': 'd͡z', 'dž': 'd͡ʒ', 'v': 'v', 'g': 'ɡ', 'b': 'b',
           'z': 'z', 'ž': 'ʒ', 'd': 'd', 'ď': 'ɟ', 'h': 'ɦ',
           'x': 'ks', 'w': 'v', 'q': 'kv'}

# ... and after devoicing (clause end or voiceless cluster).
_VOICED_DEVOICED = {'dz': 't͡s', 'dž': 't͡ʃ', 'v': 'f', 'g': 'k', 'b': 'p',
                    'z': 's', 'ž': 'ʃ', 'd': 't', 'ď': 'c', 'h': 'x',
                    'x': 'ks', 'w': 'f', 'q': 'kf'}

# Voiceless obstruents.  'ch' belongs here: it is the voiceless fricative
# [x] and only becomes [ɣ] through assimilation to a following voiced
# obstruent.  (It used to sit in the voiced tables, which produced 'ɣ'
# before vowels, e.g. in "chata".)
_VOICELESS = {'c': 't͡s', 'č': 't͡ʃ', 'f': 'f', 'k': 'k',
              'p': 'p', 's': 's', 'š': 'ʃ', 't': 't', 'ť': 'c',
              'ch': 'x'}

_VOICELESS_VOICED = {'c': 'd͡z', 'č': 'd͡ʒ', 'f': 'v', 'k': 'ɡ', 'p': 'b',
                     's': 'z', 'š': 'ʒ', 't': 'd', 'ť': 'ɟ',
                     'ch': 'ɣ'}

# Prefixes ending in a vowel: a following vowel starts a new syllable
# (glottal stop) instead of forming a diphthong with the prefix vowel.
_VOWEL_PREFIXES = ('nade', 'obe', 'pode', 'roze', 'se', 've', 'ná',
                   'vze', 'ze', 'ne', 'vele', 'ante', 'de', 'pre', 're',
                   'vice', 'na', 'za', 'leda', 'pa', 'pra', 'sotva', 'ana',
                   'dia', 'extra', 'hepta', 'hexa', 'infra', 'intra',
                   'kontra', 'meta', 'para', 'supra', 'tetra', 'ultra',
                   'mimo', 'okolo', 'polo', 'skoro', 'alo', 'hetero',
                   'homo', 'hypo', 'iso', 'kvadro', 'makro', 'mezzo',
                   'mikro', 'proto', 'pseudo', 'retro', 'mono', 'mili',
                   'kilo', 'zá')

# TODO: foreign words

# Letters that palatalize a preceding d/t/n/l.
_SOFTENING = ('e', 'i', 'í')

# Digraphs are single tokens; every other character is its own token.
_TOKEN = re.compile(r'ch|dz|dž|.', re.DOTALL)

# Clause delimiters.  The capturing group makes ``split`` return the
# delimiters too, so clauses and delimiters can never get out of step.
_CLAUSE_DELIMITER = re.compile(r'([,;.!?"\-–])')
_STRONG_DELIMITERS = ('.', '?', '!', ';', '"')


def _ends_with_vowel_prefix(tokens, i):
    """Return True if the tokens ending at index *i* spell a vowel prefix."""
    for prefix in _VOWEL_PREFIXES:
        start = i + 1 - len(prefix)
        if start >= 0 and ''.join(tokens[start:i + 1]) == prefix:
            return True
    return False


def _transcribe_clause(clause):
    """Transcribe one clause (no delimiters inside) to space-joined IPA."""
    part = _TOKEN.findall(clause.lower().strip())
    size = len(part)
    ipa = list(part)  # tokens without a rule (spaces, digits) stay as-is

    # intervals for neutralization and assimilation
    posit_vowel = [-1] + [i for i, tok in enumerate(part) if tok in _VOWELS]
    posit_sonor = [i for i, tok in enumerate(part) if tok in _SONORS]

    # j = last vowel or sonorant of the clause, whichever comes later
    j = posit_vowel[-1]
    if posit_sonor and posit_sonor[-1] > j:
        j = posit_sonor[-1]

    # neutralization: obstruents after that point are clause-final
    for i in range(size - 1, j, -1):
        tok = part[i]
        if tok in _VOICED_DEVOICED:
            ipa[i] = _VOICED_DEVOICED[tok]
        elif tok in _VOICELESS:
            ipa[i] = _VOICELESS[tok]

    # transcription and regressive assimilation, right to left, one
    # vowel-to-vowel interval at a time
    while posit_vowel:
        start = j
        j = posit_vowel.pop()
        voice = None  # voicing of the cluster: None/True/False
        for i in range(start, j, -1):
            tok = part[i]
            prev = part[i - 1] if i > 0 else None
            nxt = part[i + 1] if i + 1 < size else None

            # vowels
            if tok in _VOWELS:
                # ou, eu, au
                if tok in ('a', 'e', 'o') and nxt == 'u':
                    if _ends_with_vowel_prefix(part, i):
                        ipa[i] = _VOWELS[tok] + ' ʔ'
                    else:
                        ipa[i] = _VOWELS[tok] + 'u̯'
                        ipa[i + 1] = ''
                # ia, ie, iu
                elif tok == 'i' and nxt in ('a', 'e', 'u'):
                    if _ends_with_vowel_prefix(part, i):
                        ipa[i] = _VOWELS[tok] + ' ʔ'
                    else:
                        # the following vowel was transcribed already
                        # (right-to-left order); merge it into this cell
                        ipa[i] = 'i̯' + ipa[i + 1]
                        ipa[i + 1] = ''
                else:
                    ipa[i] = _VOWELS[tok]
                # glottal stop: clause-initial, or word-initial after a
                # word that ends in a vowel
                if i == 0 or (prev == ' ' and part[i - 2] in _VOWELS):
                    ipa[i] = 'ʔ ' + ipa[i]

            # sonorants (including the one that closes the clause, which
            # used to be skipped and left as raw orthography)
            elif tok in _SONORS:
                voice = None
                if tok == 'n' and nxt == 'n':
                    ipa[i] = ''
                elif tok == 'n' and nxt in ('k', 'g'):
                    ipa[i] = 'ŋ'
                # ni, ní, ne, nie, niu, nia
                elif tok == 'n' and nxt in _SOFTENING:
                    ipa[i] = 'ɲ'
                # li, lí, le, lie, liu, lia
                elif tok == 'l' and nxt in _SOFTENING:
                    ipa[i] = 'ʎ'
                elif tok == 'm' and nxt in ('v', 'f'):
                    ipa[i] = 'ɱ'
                else:
                    ipa[i] = _SONORS[tok]

            # kk
            elif tok == 'k' and nxt == 'k':
                ipa[i] = ''

            # rightmost obstruent of a cluster decides its voicing
            elif voice is None:
                if tok in _VOICED:
                    # 'v' is voiced itself but does not voice the cluster
                    voice = None if tok == 'v' else True
                    # di, dí, de, dia, die, diu
                    if tok == 'd' and nxt in _SOFTENING:
                        ipa[i] = 'ɟ'
                    else:
                        ipa[i] = _VOICED[tok]
                elif tok in _VOICELESS:
                    voice = False
                    # ti, tí, te, tia, tie, tiu
                    if tok == 't' and nxt in _SOFTENING:
                        ipa[i] = 'c'
                    else:
                        ipa[i] = _VOICELESS[tok]

            # assimilation to the voicing chosen above
            elif voice is True and tok in _VOICED:
                ipa[i] = _VOICED[tok]
            elif voice is True and tok in _VOICELESS_VOICED:
                ipa[i] = _VOICELESS_VOICED[tok]
            elif voice is False and tok in _VOICED_DEVOICED:
                ipa[i] = _VOICED_DEVOICED[tok]
            elif voice is False and tok in _VOICELESS:
                ipa[i] = _VOICELESS[tok]

    # drop cells emptied by diphthongs and geminates
    return ' '.join(tok for tok in ipa if tok)


# function for the phonetic transcription of Slovak language to IPA
def ipa_slovak(text):
    """Return the IPA transcription of the given Slovak text or word.

    The text is split into clauses on ``, ; . ! ? " - –``.  Each clause is
    lower-cased and transcribed; IPA symbols are separated by single
    spaces.  In the result ``. ? ! ; "`` become ``' || '`` and ``, - –``
    become ``' | '``.  Characters without a rule are passed through.
    """
    text = text.replace('...', '.')
    # even indices are clauses, odd indices are the delimiters between them
    pieces = _CLAUSE_DELIMITER.split(text)
    output = []
    for index, piece in enumerate(pieces):
        if index % 2:
            output.append(' || ' if piece in _STRONG_DELIMITERS else ' | ')
        elif piece:
            output.append(_transcribe_clause(piece))
    return ''.join(output)


def _main():
    """Transcribe the file named by the only argument, else piped stdin."""
    # The explicit file argument is checked first so that it also works
    # when stdin is not a terminal (cron, subprocess, IDE runners).
    if len(sys.argv) == 2:
        with open(sys.argv[1], mode='r', encoding='utf-8') as f:
            for line in f:
                print(ipa_slovak(line.strip()))
    elif not sys.stdin.isatty():
        for line in sys.stdin:
            print(ipa_slovak(line.strip()))
    else:
        print('Error: Use script in pipeline or give the path '
              'to the relevant file in the first argument.')


# running script if it is used in shell (with stdin or path to file)
if __name__ == '__main__':
    _main()