"""Convert a word from Greek orthography into its hypothesized
pronunciation in the International Phonetic Alphabet (IPA).
https://raw.githubusercontent.com/j-duff/cltk/ipa/
cltk/phonology/greek/transcription.py
"""
import re
import unicodedata
from typing import List
from nltk.tokenize import wordpunct_tokenize
from cltk.core.cltk_logger import logger
try:
# James Tauber's greek_accentuation package
from greek_accentuation import characters as chars
except ImportError as import_error:
message = (
'Missing "greek_accentuation" package. Install with '
"`pip install greek-accentuation`."
)
logger.error(message)
logger.error(import_error)
raise
__author__ = ["Jack Duff <jmunroeduff@gmail.com>"]
__license__ = "MIT License. See LICENSE."
# Dictionaries of phonological reconstructions for use in transcribing.
# Probert, Philomen. 2010. Phonology, in E. Bakker, A Companion to the \
# Ancient Greek Language.
# (Entries which are commented out are realized through diacritic analysis.)
GREEK = {
"Attic": {
"Probert": {
"correspondence": {
"α": "ɑ",
# 'ᾱ' : 'ɑː',
"β": "b",
"γ": "g",
"δ": "d",
"ε": "e",
"ζ": "sd",
"η": "ɛː",
"θ": "tʰ",
"ι": "i",
# 'ῑ' : 'iː',
"κ": "k",
"λ": "l",
"μ": "m",
"ν": "n",
"ξ": "ks",
"ο": "o",
"π": "p",
"ρ": "r",
"σ": "s",
"ς": "s",
"τ": "t",
"υ": "y",
# 'ῡ' : 'yː',
"φ": "pʰ",
"χ": "kʰ",
"ψ": "ps",
"ω": "ɔː",
"αι": "ɑj",
# 'ᾱι' : 'ɑːj',
# 'ᾳ' : 'ɑːj',
"ει": "ẹː",
"ηι": "ɛːj",
# 'ῃ' : 'ɛːj',
"οι": "oj",
"ωι": "ɔːj",
# 'ῳ' : 'ɔːj',
"υι": "yj",
"αυ": "ɑw",
"ευ": "ew",
"ηυ": "ɛːw",
"ου": "ọː",
},
"diphthongs": [
# and digraphs
"αι",
"ει",
"ηι",
"οι",
"ωι",
"υι",
"αυ",
"ευ",
"ηυ",
"ου",
],
"punctuation": [
".",
",",
"·",
"•",
":",
"!",
";",
"(",
")",
"[",
"]",
"'",
"᾽",
'"',
],
# Should the h value generated by the diacritic analysis
# be moved to the front of possible diphthongs?
"front_h": True,
# Should iota subscripts be pronounced as diphthongs
# or simply ignored?
# (dependent on Greek reconstruction)
"pronounce_iota_sub": True,
# What contextual pronunciation rules apply?
"alternations": [
"r_devoice",
"s_voice_assimilation",
"nasal_place_assimilation",
"g_nasality_assimilation",
],
}
}
}
# All IPA characters used sorted by natural classes.
# WILL NEED ADDITIONS AS MORE RECONSTRUCTIONS USED
IPA = {
"voiced": ["b", "d", "g", "m", "n", "ŋ" "l", "r", "z"], # [+voice]
"labial": ["b", "p", "pʰ", "m"], # [+labial]
"coronal": ["d", "t", "tʰ", "n", "s", "z", "r", "r̥", "l"], # [+coronal]
"velar": ["g", "k", "kʰ", "ŋ"], # [+velar]
"nasal": ["m", "n", "ŋ"], # [+nasal]
"approximant": ["l", "r", "r̥", "j", "w"], # [+approximant]
"continuant": ["h", "s", "z", "l", "r", "r̥"], # [+continuant, +consonantal]
"vowel": [ # [-consonantal -approximant]
"ɑ",
"ɑː",
"e",
"ẹː",
"ɛː",
"i",
"iː",
"o",
"ọː",
"y",
"yː",
"ɔː",
"ɑj", # diphthongs
"ɛːj",
"ɛːj",
"oj",
"ɔːj",
"ɔːj",
"yj",
"ɑw",
"ew",
"ɛːw",
],
"high": ["i", "iː", "y", "yː"], # [-consonantal, +high]
"low": ["ɑ", "ɛː", "ɔː", "ɑː"], # [-consonantal, +low]
"front": ["i", "iː", "y", "yː", "e", "ẹː", "ɛː"], # [-consonantal, +front]
"back": ["o", "ọː", "ɑ", "ɑː", "ɔː"], # [-consonantal, +back]
"boundary": ["#"],
}
[docs]class Phone:
"""A phonological unit to be manipulated and represented as an IPA string."""
# Has a bundle of feature values that help classify it so that it can
# trigger contextual pronunciation changes.
def __init__(self, ipa_ch):
# Additions to greek_accentuation.characters for use in this class:
ipa_circumflex = "\u0302" # ˆ, the IPA tonal notation for ῀
tones = chars.extract_diacritic(chars.ACUTE, ipa_circumflex)
# Collects IPA tonal diacritics
clear_tones = chars.remove_diacritic(chars.ACUTE, ipa_circumflex)
# Clears IPA tonal diacritics
# eventually exported to output string
self.ipa = unicodedata.normalize("NFC", ipa_ch)
# without IPA diacritics
self.bare = unicodedata.normalize("NFC", clear_tones(ipa_ch))
# selects the IPA diacritics
self.tone = tones(ipa_ch)
# will be assigned once in Word, as the pre-context of this phone
self.left = ""
# .... as the post-context of this phone
self.right = ""
# bundle of features, stored as booleans:
self.vce = self.bare in IPA["voiced"]
self.lab = self.bare in IPA["labial"]
self.cor = self.bare in IPA["coronal"]
self.vel = self.bare in IPA["velar"]
self.nas = self.bare in IPA["nasal"]
self.app = self.bare in IPA["approximant"]
self.cont = self.bare in IPA["continuant"]
self.vow = self.bare in IPA["vowel"]
self.hi = self.bare in IPA["high"]
self.lo = self.bare in IPA["low"]
self.fr = self.bare in IPA["front"]
self.bk = self.bare in IPA["back"]
self.bound = self.bare in IPA["boundary"]
[docs]class Word:
"""Max. phonological unit, contains phones and triggers alternations."""
# An ordered collection of Phones,
# which are bundles of features/IPA strings.
def __init__(self, ipa_str: str, root):
"""
:param ipa_str:
:param root:
"""
self.string = unicodedata.normalize("NFC", ipa_str)
# Appropriate directory in the reconstruction dictionary
self.root = root
# list of contextual pronunciation alternations
self.alts = self.root["alternations"]
# Turns string of IPA characters into list of Phones
self.phones = [Phone(c) for c in re.findall(r".[̥́̂ʰ]?ː?[jw]?", self.string)]
[docs] def _refresh(self):
"""
Assigns left and right contexts for every phone
"""
for n in range(len(self.phones)):
p = self.phones[n]
if n != 0:
p.left = self.phones[n - 1]
else:
p.left = Phone("#")
if n != len(self.phones) - 1:
p.right = self.phones[n + 1]
else:
p.right = Phone("#")
[docs] def _r_devoice(self):
"""
Pronounce r as voiceless word-init or after another r.
"""
out_phones = self.phones
target = Phone("r̥")
for n in range(len(self.phones)):
p = self.phones[n]
if p.left.bound and p.ipa == "r":
out_phones[n] = target
if p.left.ipa == "r" and p.ipa == "r":
out_phones[n] = target
self.phones = out_phones
self._refresh()
[docs] def _s_voice_assimilation(self):
"""
Pronounce s as voiced (z) when followed by voiced.
"""
out_phones = self.phones
target = Phone("z")
for n in range(len(self.phones)):
p = self.phones[n]
if p.ipa == "s" and p.right.vce:
out_phones[n] = target
self.phones = out_phones
self._refresh()
[docs] def _nasal_place_assimilation(self):
"""
Pronounce nasals/g as velar nasals when followed by velar.
"""
out_phones = self.phones
target = Phone("ŋ")
for n in range(len(self.phones)):
p = self.phones[n]
if (p.nas or p.ipa == "g") and p.right.vel:
out_phones[n] = target
self.phones = out_phones
self._refresh()
[docs] def _g_nasality_assimilation(self):
"""
Pronounce g as (velar) nasal when followed by nasal.
"""
out_phones = self.phones
target = Phone("ŋ")
for n in range(len(self.phones)):
p = self.phones[n]
if p.ipa == "g" and p.right.nas:
out_phones[n] = target
self.phones = out_phones
self._refresh()
# list of all possible alternations
ALTERNATIONS = [
("r_devoice", _r_devoice),
("s_voice_assimilation", _s_voice_assimilation),
("nasal_place_assimilation", _nasal_place_assimilation),
("g_nasality_assimilation", _g_nasality_assimilation),
]
def _alternate(self):
self._refresh()
# runs all alternations
for a in Word.ALTERNATIONS:
if a[0] in self.alts:
a[1](self)
[docs] def syllabify(self) -> List[str]:
"""
:return: syllabified word
"""
nuclei = []
for n in range(len(self.phones)):
p = self.phones[n]
if p.vow:
nuclei.append(n)
# initialize syllables with a tuple for the first syllable
# where onset is everything before the first nucleus
# and coda remains unknown.
if nuclei:
syllables = [[self.phones[0 : nuclei[0]], [self.phones[nuclei[0]]], []]]
# continue for every nucleus, assuming that everything between
# the previous nucleus and it is the onset.
for x in range(len(nuclei) - 1):
i = nuclei[x + 1]
onset = self.phones[nuclei[x] + 1 : i]
nucleus = [self.phones[i]]
syllables.append([onset, nucleus, []])
# assume that everything after the final nucleus is final coda.
syllables[-1][2] = self.phones[nuclei[-1] + 1 :]
else:
syllables = [[self.phones]]
# now go through and check onset viability
for x in range(len(syllables) - 1):
onset = syllables[x + 1][0]
nucleus = syllables[x + 1][1]
coda = syllables[x + 1][2]
# trim all onsets greater than the maximum 2 phones
# removing extra phones from the left
# and appending them to the previous coda
if len(onset) > 2:
trim = onset[:-2]
del onset[:-2]
syllables[x][2] = trim
# once onset is 2 phones...
if len(onset) == 2:
# voiceless oral stop + nasal or liquid
# is a viable sequence, and passes
if (
(not onset[0].cont)
and (not onset[0].vce)
and (not onset[0].app)
and (not onset[0].nas)
and (onset[1].nas or onset[1].app)
):
break
# voiced stop + liquid is also a viable sequence, and passes
elif (not onset[0].cont) and onset[1].app:
break
# otherwise, onset must be right Phone only
# the left phone is appended to the previous coda
else:
trim = onset[0]
del onset[0]
syllables[x][2] += [trim]
self.syllables = syllables
return syllables
def _print_ipa(self, syllabify):
out = ""
# if syllabification flag is present
# print IPA by syllable segment
if syllabify:
syllables = self.syllabify()
# ultima is final syllable
ultima = syllables[-1]
for s in syllables:
for n in s:
for p in n:
out += p.ipa
# after every non-final syllable
# print IPA syll punctuation: '.'
if s != ultima:
out += "."
# otherwise print unsyllabified IPA
else:
for p in self.phones:
out += p.ipa
return out
[docs]class Transcriber:
"""Uses a reconstruction to transcribe a orthographic string into IPA."""
def __init__(self, dialect, reconstruction):
self.dialect = dialect
self.recon = reconstruction
self.root = GREEK[self.dialect][self.recon]
self.table = self.root["correspondence"]
self.diphs = self.root["diphthongs"]
self.punc = self.root["punctuation"]
self.h = self.root["front_h"]
self.i = self.root["pronounce_iota_sub"]
[docs] def _parse_diacritics(self, ch):
"""
:param ch: EG: input with base α -> α/ACCENT/ETC/ (where ETC includes diaeresis, iota subscripts, and macrons)
:return: a string with separated and organized diacritics for easier access later.
"""
# Additions to greek_accentuation.characters for use here:
marked_breathing = chars.extract_diacritic(chars.ROUGH)
# (Don't need SMOOTH for these purposes)
marked_accents = chars.extract_diacritic(chars.ACUTE, chars.CIRCUMFLEX)
# (Don't need GRAVE for these purposes)
marked_length = chars.extract_diacritic(chars.LONG)
# (Don't need SHORT for these purposes)
h = marked_breathing(ch)
acc = marked_accents(ch)
etc = [chars.diaeresis(ch), chars.iota_subscript(ch), marked_length(ch)]
out = chars.base(ch).lower() # Initialize out as base of character.
if h is not None and out != "ρ": # If any rough breathing, and not rho
out = "h///" + out # insert an h/// before the base.
# ('aspirated' rhos can be ignored,
# and dealt with seperately.)
out += "/" # Create 1st boundary
if acc is not None: # If any accent, place between 1st and 2nd boundary
out += acc
out += "/" # Create 2nd boundary
for c in [c for c in etc if c is not None]: # If any other diacritics,
out += c # place between second and final boundary
out += "/" # Create final boundary
return out
[docs] def _prep_text(self, text):
"""
Performs preparatory tasks grouping and reordering characters in order to make transcription formulaic.
:param text:
:return:
"""
string_in = "".join([self._parse_diacritics(ch) for ch in text])
diph1 = "".join(list(set([d[0] for d in self.diphs])))
# (list of all acceptable first chars in diphthongs)
diph2 = "".join(list(set([d[1] for d in self.diphs])))
# (list of all acceptable second chars in diphthongs)
if self.h:
# Locates acceptable diphthongs and treats them as single base
# Combines all diacritics accordingly
# Also finds any h's stranded in media diphthong (\3) and moves
# them to the left edge
pattern = (
r"([" + diph1 + r"])\/\/([̄]?\/)(h///)?([" + diph2 + r"]\/[́͂]?\/)\/"
)
diphshift = re.sub(pattern, r"\3\1\4\2", string_in)
else:
# Same as above, minus h-moving
pattern = r"([" + diph1 + r"])\/\/([̄]?\/)([" + diph2 + r"]\/[́͂]?\/)\/"
diphshift = re.sub(pattern, r"\1\3\2", string_in)
if self.i:
# Locates iota subscripts and treats as base + iota diphthongs
# Adds macron, since iota subscripts only appear on long vowels
# (and we need to use all clues to identify long vowels)
iotashift = re.sub(
r"([αηω])(\/[́͂]*\/[̄ ̈]*)ͅ([̄ ̈]*\/)", r"\1ι\2̄\3", diphshift
)
else:
# Same as above, but deletes iota entirely: only adds macrons
iotashift = re.sub(
r"([αηω])(\/[́͂]*\/[̄ ̈]*)ͅ([̄ ̈]*\/)", r"\1\2̄\3", diphshift
)
tup_out = re.findall(r"(..?)\/([́͂]*)\/([̄ ̈]*)\/", iotashift)
return tup_out
[docs] def transcribe(self, text, accentuate=True, syllabify=True):
"""
>>> blackwell_transcriber = Transcriber("Attic", "Probert")
>>> example = blackwell_transcriber.transcribe("δέκατον μὲν ἔτος τόδ᾽ ἐπεὶ Πριάμου μέγας ἀντίδικος, Μενέλαος ἄναξ ἠδ᾽ Ἀγαμέμνων, διθρόνου Διόθεν καὶ δισκήπτρου τιμῆς ὀχυρὸν ζεῦγος Ἀτρειδᾶν στόλον Ἀργείων χιλιοναύτην, τῆσδ᾽ ἀπὸ χώρας")
>>> example
'[dé.kɑ.ton men é.tos tód e.pẹː pri.ɑ́.mọː mé.gɑs ɑn.tí.di.kos me.né.lɑ.os ɑ́.nɑks ɛːd ɑ.gɑ.mém.nɔːn di.tʰró.nọː di.ó.tʰen kɑj dis.kɛ́ːp.trọː ti.mɛ̂ːs o.kʰy.ron zdêw.gos ɑ.trẹː.dɑ̂n stó.lon ɑr.gẹ́ː.ɔːn kʰi.li.o.nɑ́w.tɛːn tɛ̂ːzd ɑ.po kʰɔ́ː.rɑs]'
:param text: word-tokenized, stripped of non-diacritic punctuation, and diphthongs and diacritics are handled
:param accentuate: if True, result is accentuated
:param syllabify: if True, result if syllabified
:return: transcribed text
"""
# input is
inp = [
self._prep_text(w) for w in wordpunct_tokenize(text) if w not in self.punc
]
words = []
for w in inp:
out = ""
for c in w:
ipa = self.table.get(c[0], c[0])
# if there are macrons in the diacritics, adds the ipa
# notation for length (if it isn't there already)
if chars.LONG in c[2]:
if "ː" not in ipa:
ipa = ipa[0] + "ː" + ipa[1:]
if accentuate:
# adds proper IPA notation for accents
# if circumflex accent, adds appropriate
# ipa tone contour notation
if chars.CIRCUMFLEX in c[1]:
ipa = ipa[0] + "̂" + ipa[1:]
# if acute accent, adds appropriate
# ipa tone contour notation
if chars.ACUTE in c[1]:
if len(ipa) > 1:
ipa = ipa[0] + "́" + ipa[1:]
else:
ipa += "́"
out += ipa
transcription = Word(out, self.root)
transcription._alternate()
words.append(transcription)
# Encloses output in brackets, proper notation for surface form.
return "[" + " ".join([w._print_ipa(syllabify) for w in words]) + "]"