# Source code for cltk.phonology.lat.syllabifier

"""Split Latin words into a list of syllables, based on a set of Latin
language syllable specifications and the original work of Father Matthew
Spencer in C# and Javascript. Original documentation from Fr. Spencer
is preserved where applicable.
"""

import re
from typing import List

__author__ = ["Luke Hollis <lukehollis@gmail.com>"]
__license__ = "MIT License. See LICENSE."

# Phonological tables consulted by the syllabifier.
# N.B. "ui" counts as a diphthong only in the words listed under
# "exceptions" (following Wheelock's Latin), so it is deliberately
# absent from "diphthongs".
LATIN = {
    "diphthongs": ["ae", "au", "ei", "eu", "oe"],
    # Whole words whose syllable split is given verbatim.
    "exceptions": {
        "huius": ["hui", "us"],
        "cuius": ["cui", "us"],
        "huic": ["huic"],
        "cui": ["cui"],
        "hui": ["hui"],
    },
    # "y" is listed as a vowel: it is not native to Latin, but it is
    # useful for words borrowed from Greek.
    "vowels": [
        "a", "e", "i", "o", "u",
        "á", "é", "í", "ó", "ú",
        "ā", "ē", "ī", "ō", "ū",
        "æ", "œ",
        "ǽ",  # there appears to be no accented counterpart of œ in Unicode
        "y",
    ],
    "mute_consonants_and_f": ["b", "c", "d", "g", "p", "t", "f"],
    "liquid_consonants": ["l", "r"],
    # Order is preserved exactly as originally listed.
    "prefixes": (
        "a ab abs ad ac amb ambi ante circum co con com contra counter "
        "de dis di dif e ex ef extra extro in en infra inter intro juxta "
        "ne non ob per post prae pre preter pro quasi re red retro "
        "se sed sin sine sub subter super sur supra trans tra tran ultra outr"
    ).split(),
    "single_syllable_prefixes": ["in", "ex", "ob"],
}


def _is_consonant(char: str) -> bool:
    """Return ``True`` if ``char`` is not in the language's vowel list.

    Anything absent from ``LATIN["vowels"]`` counts as a consonant here,
    so the check is a plain negated membership test rather than a lookup
    in a dedicated consonant table.
    """
    return char not in LATIN["vowels"]
def _is_vowel(char: str) -> bool:
    """Return ``True`` if ``char`` appears in the language's vowel list."""
    vowels = LATIN["vowels"]
    return char in vowels
def _is_diphthong(char_1: str, char_2: str) -> bool:
    """Return ``True`` if ``char_1`` followed by ``char_2`` is a diphthong.

    The two characters are concatenated in order and looked up in
    ``LATIN["diphthongs"]``.
    """
    pair = char_1 + char_2
    return pair in LATIN["diphthongs"]
def _is_mute_consonant_or_f(char: str) -> bool:
    """Return ``True`` if ``char`` is listed in ``mute_consonants_and_f``."""
    mutes_and_f = LATIN["mute_consonants_and_f"]
    return char in mutes_and_f
def _is_liquid_consonant(char: str) -> bool:
    """Return ``True`` if ``char`` is listed in ``liquid_consonants``."""
    return char in LATIN["liquid_consonants"]
def syllabify(word: str) -> List[str]:
    """Split a Latin word into a list of syllables.

    Words found in ``LATIN["exceptions"]`` get their stored split.
    Otherwise a leading entry of ``LATIN["single_syllable_prefixes"]`` (if
    any) is emitted as its own syllable and the remainder is scanned
    character by character, closing a syllable whenever the rules below
    say so.

    Matching is exact: characters are compared as given against the
    module's tables, so no case folding is applied here.

    :param word: the word to split.
    :return: a new list of syllables; the empty string yields ``[]``.

    >>> syllabify('sidere')
    ['si', 'de', 're']
    """
    # Irregular words: hand back a copy so callers who mutate the result
    # cannot corrupt the shared exception table.
    exception = LATIN["exceptions"].get(word)
    if exception is not None:
        return list(exception)

    syllables: List[str] = []

    # Try the longest prefix first. sorted() builds a new list, leaving the
    # module-level table untouched.
    for prefix in sorted(
        LATIN["single_syllable_prefixes"], key=len, reverse=True
    ):
        if word.startswith(prefix):
            syllables.append(prefix)
            # Plain slicing: no regex needed, no escaping concerns.
            word = word[len(prefix):]
            break

    syllable = ""
    last_index = len(word) - 1

    for i, char in enumerate(word):
        syllable += char

        if i == last_index:
            # The final character always closes the current syllable.
            syllables.append(syllable)
            break

        next_char = word[i + 1]
        # "" stands in for "no previous character"; it is neither "q" nor
        # a vowel, which is exactly how a missing neighbour must behave in
        # the checks below.
        prev_char = word[i - 1] if i > 0 else ""
        has_after_next = i + 2 <= last_index
        after_next = word[i + 2] if has_after_next else ""

        char_is_vowel = _is_vowel(char)
        # 'i' before a vowel acts as a consonant (y) when it starts the
        # word (iesu) or stands between vowels (alleluia).
        # Note: compounds such as 'adiungere' are not covered by this rule.
        if (
            char == "i"
            and _is_vowel(next_char)
            and (i == 0 or _is_vowel(prev_char))
        ):
            char_is_vowel = False

        if char_is_vowel:
            # Two adjacent vowels that do not form a diphthong split
            # between them -- except the "u" of "qu" before a non-"u".
            splits_before_vowel = (
                _is_vowel(next_char)
                and not _is_diphthong(char, next_char)
                and not (
                    prev_char == "q" and char == "u" and next_char != "u"
                )
            )
            # Looking two characters ahead: close the syllable when a
            # vowel follows the next character (with the "qu" and
            # diphthong provisos), or when the next two characters are a
            # mute consonant (or f) plus a liquid.
            splits_on_lookahead = has_after_next and (
                (
                    char == "u"
                    and prev_char != "q"
                    and _is_vowel(after_next)
                )
                or (
                    char != "u"
                    and _is_vowel(after_next)
                    and not _is_diphthong(char, next_char)
                )
                or (
                    _is_mute_consonant_or_f(next_char)
                    and _is_liquid_consonant(after_next)
                )
            )
            syllable_complete = splits_before_vowel or splits_on_lookahead
        else:
            syllable_complete = (
                # Another consonant follows, and it is not the last letter
                not _is_vowel(next_char)
                and has_after_next
                # Mute (or f) + liquid clusters stay together
                and not (
                    _is_mute_consonant_or_f(char)
                    and _is_liquid_consonant(next_char)
                )
                # ch / ph / th stay together unless preceded by a vowel
                and not (
                    char in ("c", "p", "t")
                    and next_char == "h"
                    and not _is_vowel(prev_char)
                )
                # A consonant never forms a syllable on its own
                and len(syllable) != 1
            )

        if syllable_complete:
            syllables.append(syllable)
            syllable = ""

    return syllables