Source code for cltk.prosody.lat.scanner

"""
Scansion module for scanning Latin prose rhythms.
"""
from typing import Dict, List

from cltk.prosody.lat.syllabifier import Syllabifier


class Scansion:
    """
    Preprocesses Latin text for prose rhythm analysis.
    """

    SHORT_VOWELS = ["a", "e", "i", "o", "u", "y"]
    LONG_VOWELS = ["ā", "ē", "ī", "ō", "ū"]
    VOWELS = SHORT_VOWELS + LONG_VOWELS
    DIPHTHONGS = ["ae", "au", "ei", "oe", "ui"]
    SINGLE_CONSONANTS = [
        "b", "c", "d", "g", "k", "l", "m", "n",
        "p", "q", "r", "s", "t", "v", "f", "j",
    ]
    DOUBLE_CONSONANTS = ["x", "z"]
    CONSONANTS = SINGLE_CONSONANTS + DOUBLE_CONSONANTS
    DIGRAPHS = ["ch", "ph", "th", "qu"]
    LIQUIDS = ["r", "l"]
    MUTES = ["b", "p", "d", "t", "c", "g"]
    MUTE_LIQUID_EXCEPTIONS = ["gl", "bl"]
    NASALS = ["m", "n"]
    SESTS = ["sc", "sm", "sp", "st", "z"]

    def __init__(self, punctuation=None, clausula_length=13, elide=True):
        if punctuation is None:
            punctuation = [".", "?", "!", ";", ":"]
        self.punctuation = punctuation
        self.clausula_length = clausula_length
        self.elide = elide
        self.syllabifier = Syllabifier()
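
    # Quantity rules applied by _tokenize_syllables below: a syllable is long
    # by nature when it contains a long vowel or a diphthong (syllables
    # beginning "qui" are excepted), and long by position when its vowel is
    # followed by two consonants or a double consonant (x, z). A mute + liquid
    # cluster (e.g. "br", "gr") does not make position; it is recorded as
    # (False, "mute+liquid") so callers can treat it as ambiguous.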
    def _tokenize_syllables(self, word: str) -> List[Dict]:
        """
        Tokenize syllables for word.
        "mihi" -> [{"syllable": "mi", index: 0, ... } ... ]

        Syllable properties:
        syllable: string -> syllable
        index: int -> position in word
        long_by_nature: bool -> is the syllable long by nature
        accented: bool -> does the syllable receive the accent
        long_by_position: tuple -> (is the syllable long by position, None or a
        note on ambiguous cases such as "mute+liquid")
        elide: tuple -> (is the syllable elided, None or the elision strength)

        :param word: string
        :return: list

        >>> Scansion()._tokenize_syllables("mihi")
        [{'syllable': 'mi', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'hi', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}]
        >>> Scansion()._tokenize_syllables("ivi")
        [{'syllable': 'i', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'vi', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}]
        >>> Scansion()._tokenize_syllables("audītū")
        [{'syllable': 'au', 'index': 0, 'elide': (False, None), 'long_by_nature': True, 'long_by_position': (False, None), 'accented': False}, {'syllable': 'dī', 'index': 1, 'elide': (False, None), 'long_by_nature': True, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'tū', 'index': 2, 'elide': (False, None), 'long_by_nature': True, 'long_by_position': (False, None), 'accented': False}]
        >>> Scansion()._tokenize_syllables("ā")
        [{'syllable': 'ā', 'index': 0, 'elide': (False, None), 'long_by_nature': True, 'long_by_position': (False, None), 'accented': True}]
        >>> Scansion()._tokenize_syllables("conjiciō")
        [{'syllable': 'con', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': False}, {'syllable': 'ji', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'ci', 'index': 2, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}, {'syllable': 'ō', 'index': 3, 'elide': (False, None), 'long_by_nature': True, 'long_by_position': (False, None), 'accented': False}]
        >>> Scansion()._tokenize_syllables("lingua")
        [{'syllable': 'lin', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}, {'syllable': 'gua', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}]
        >>> Scansion()._tokenize_syllables("abrante")
        [{'syllable': 'ab', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, 'mute+liquid'), 'accented': False}, {'syllable': 'ran', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}, {'syllable': 'te', 'index': 2, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}]
        >>> Scansion()._tokenize_syllables("redemptor")
        [{'syllable': 'red', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}, {'syllable': 'em', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}, {'syllable': 'ptor', 'index': 2, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}], 'syllables_count': 3}
        >>> Scansion()._tokenize_syllables("nagrante")
        [{'syllable': 'na', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, 'mute+liquid'), 'accented': False}, {'syllable': 'gran', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}, {'syllable': 'te', 'index': 2, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}]
        """
        syllable_tokens = []
        syllables = self.syllabifier.syllabify(word)
        longs = self.LONG_VOWELS + self.DIPHTHONGS
        for i, _ in enumerate(syllables):
            # basic properties
            syllable_dict = {
                "syllable": syllables[i],
                "index": i,
                "elide": (False, None),
            }
            # is long by nature
            if any(long in syllables[i] for long in longs):
                if syllables[i][:3] != "qui":
                    syllable_dict["long_by_nature"] = True
                else:
                    syllable_dict["long_by_nature"] = False
            else:
                syllable_dict["long_by_nature"] = False
            # long by position intra word
            if (
                i < len(syllables) - 1
                and syllable_dict["syllable"][-1] in self.CONSONANTS
            ):
                if (
                    syllable_dict["syllable"][-1] in self.MUTES
                    and syllables[i + 1][0] in self.LIQUIDS
                    and syllable_dict["syllable"][-1] + syllables[i + 1][0]
                    not in self.MUTE_LIQUID_EXCEPTIONS
                ):
                    # mute + liquid does not make position; mark as ambiguous
                    syllable_dict["long_by_position"] = (False, "mute+liquid")
                elif (
                    syllable_dict["syllable"][-1] in self.DOUBLE_CONSONANTS
                    or syllables[i + 1][0] in self.CONSONANTS
                ):
                    syllable_dict["long_by_position"] = (True, None)
                else:
                    syllable_dict["long_by_position"] = (False, None)
            elif (
                i < len(syllables) - 1
                and syllable_dict["syllable"][-1] in self.VOWELS
                and len(syllables[i + 1]) > 1
            ):
                if (
                    syllables[i + 1][0] in self.MUTES
                    and syllables[i + 1][1] in self.LIQUIDS
                    and syllables[i + 1][0] + syllables[i + 1][1]
                    not in self.MUTE_LIQUID_EXCEPTIONS
                ):
                    syllable_dict["long_by_position"] = (False, "mute+liquid")
                elif (
                    syllables[i + 1][0] in self.CONSONANTS
                    and syllables[i + 1][1] in self.CONSONANTS
                    or syllables[i + 1][0] in self.DOUBLE_CONSONANTS
                ):
                    syllable_dict["long_by_position"] = (True, None)
                else:
                    syllable_dict["long_by_position"] = (False, None)
            elif (
                len(syllable_dict["syllable"]) > 2
                and syllable_dict["syllable"][-1] in self.CONSONANTS
                and syllable_dict["syllable"][-2] in self.CONSONANTS
                and syllable_dict["syllable"][-3] in self.VOWELS
            ):
                syllable_dict["long_by_position"] = (True, None)
            else:
                syllable_dict["long_by_position"] = (False, None)
            syllable_tokens.append(syllable_dict)
            # is accented
            if len(syllables) > 2 and i == len(syllables) - 2:
                # penult of a word of three or more syllables: accented if
                # heavy, otherwise the accent falls back to the antepenult
                if (
                    syllable_dict["long_by_nature"]
                    or syllable_dict["long_by_position"][0]
                ):
                    syllable_dict["accented"] = True
                else:
                    syllable_tokens[i - 1]["accented"] = True
            elif len(syllables) == 2 and i == 0 or len(syllables) == 1:
                # disyllables are accented on the first syllable;
                # monosyllables are always accented
                syllable_dict["accented"] = True
            syllable_dict["accented"] = (
                False if "accented" not in syllable_dict else True
            )
        return syllable_tokens
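
    # Elision rules applied by _tokenize_words below: when a word begins with
    # a vowel or "h", the final syllable of the previous word is flagged for
    # elision -- (True, "strong") if it ends in a long vowel, a diphthong, or
    # "m", and (True, "weak") if it ends in a short vowel.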
    def _tokenize_words(self, sentence: str) -> List[Dict]:
        """
        Tokenize words for sentence.
        "Puella bona est" -> [{word: puella, index: 0, ... }, ... ]

        Word properties:
        word: string -> word
        index: int -> position in sentence
        syllables: list -> list of syllable objects
        syllables_count: int -> number of syllables in word

        :param sentence: string
        :return: list

        >>> Scansion()._tokenize_words('dedērunt te miror antōnī quorum.')
        [{'word': 'dedērunt', 'index': 0, 'syllables': [{'syllable': 'de', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}, {'syllable': 'dē', 'index': 1, 'elide': (False, None), 'long_by_nature': True, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'runt', 'index': 2, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': False}], 'syllables_count': 3}, {'word': 'te', 'index': 1, 'syllables': [{'syllable': 'te', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}], 'syllables_count': 1}, {'word': 'miror', 'index': 2, 'syllables': [{'syllable': 'mi', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'ror', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}], 'syllables_count': 2}, {'word': 'antōnī', 'index': 3, 'syllables': [{'syllable': 'an', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': False}, {'syllable': 'tō', 'index': 1, 'elide': (False, None), 'long_by_nature': True, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'nī', 'index': 2, 'elide': (False, None), 'long_by_nature': True, 'long_by_position': (False, None), 'accented': False}], 'syllables_count': 3}, {'word': 'quorum.', 'index': 4, 'syllables': [{'syllable': 'quo', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'rum', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}], 'syllables_count': 2}]
        >>> Scansion()._tokenize_words('a spes co i no xe cta.')
        [{'word': 'a', 'index': 0, 'syllables': [{'syllable': 'a', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, 'sest'), 'accented': True}], 'syllables_count': 1}, {'word': 'spes', 'index': 1, 'syllables': [{'syllable': 'spes', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}], 'syllables_count': 1}, {'word': 'co', 'index': 2, 'syllables': [{'syllable': 'co', 'index': 0, 'elide': (True, 'weak'), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}], 'syllables_count': 1}, {'word': 'i', 'index': 3, 'syllables': [{'syllable': 'i', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}], 'syllables_count': 1}, {'word': 'no', 'index': 4, 'syllables': [{'syllable': 'no', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}], 'syllables_count': 1}, {'word': 'xe', 'index': 5, 'syllables': [{'syllable': 'xe', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}], 'syllables_count': 1}, {'word': 'cta.', 'index': 6, 'syllables': [{'syllable': 'cta', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}], 'syllables_count': 1}]
        >>> Scansion()._tokenize_words('x')
        []
        >>> Scansion()._tokenize_words('atae amo.')
        [{'word': 'atae', 'index': 0, 'syllables': [{'syllable': 'a', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'tae', 'index': 1, 'elide': (True, 'strong'), 'long_by_nature': True, 'long_by_position': (False, None), 'accented': False}], 'syllables_count': 2}, {'word': 'amo.', 'index': 1, 'syllables': [{'syllable': 'a', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'mo', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}], 'syllables_count': 2}]
        >>> Scansion()._tokenize_words('bar rid.')
        [{'word': 'bar', 'index': 0, 'syllables': [{'syllable': 'bar', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}], 'syllables_count': 1}, {'word': 'rid.', 'index': 1, 'syllables': [{'syllable': 'rid', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}], 'syllables_count': 1}]
        >>> Scansion()._tokenize_words('ba brid.')
        [{'word': 'ba', 'index': 0, 'syllables': [{'syllable': 'ba', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, 'mute+liquid'), 'accented': True}], 'syllables_count': 1}, {'word': 'brid.', 'index': 1, 'syllables': [{'syllable': 'brid', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}], 'syllables_count': 1}]
        """
        tokens = []
        split_sent = [word for word in sentence.split(" ") if word != ""]
        for i, word in enumerate(split_sent):
            if len(word) == 1 and word not in self.VOWELS:
                # a single-character non-vowel is not a word; stop here
                break
            # basic properties
            word_dict = {"word": split_sent[i], "index": i}
            # syllables and syllables count
            word_dict["syllables"] = self._tokenize_syllables(split_sent[i])
            word_dict["syllables_count"] = len(word_dict["syllables"])
            # elision: word begins with a vowel or "h", so the final syllable
            # of the previous word may elide
            if (
                i != 0
                and word_dict["syllables"][0]["syllable"][0] in self.VOWELS
                or i != 0
                and word_dict["syllables"][0]["syllable"][0] == "h"
            ):
                last_syll_prev_word = tokens[i - 1]["syllables"][-1]
                if (
                    last_syll_prev_word["syllable"][-1] in self.LONG_VOWELS
                    or last_syll_prev_word["syllable"][-1] == "m"
                ):
                    last_syll_prev_word["elide"] = (True, "strong")
                elif (
                    len(last_syll_prev_word["syllable"]) > 1
                    and last_syll_prev_word["syllable"][-2:] in self.DIPHTHONGS
                ):
                    last_syll_prev_word["elide"] = (True, "strong")
                elif last_syll_prev_word["syllable"][-1] in self.SHORT_VOWELS:
                    last_syll_prev_word["elide"] = (True, "weak")
            # long by position inter word
            if (
                i > 0
                and tokens[i - 1]["syllables"][-1]["syllable"][-1]
                in self.CONSONANTS
                and word_dict["syllables"][0]["syllable"][0] in self.CONSONANTS
            ):
                # previous word ends in a consonant and current word begins
                # with a consonant
                tokens[i - 1]["syllables"][-1]["long_by_position"] = (True, None)
            elif (
                i > 0
                and tokens[i - 1]["syllables"][-1]["syllable"][-1] in self.VOWELS
                and word_dict["syllables"][0]["syllable"][0] in self.CONSONANTS
            ):
                # previous word ends in a vowel and current word begins with
                # a consonant
                if any(
                    sest in word_dict["syllables"][0]["syllable"]
                    for sest in self.SESTS
                ):
                    # current word begins with a sest
                    tokens[i - 1]["syllables"][-1]["long_by_position"] = (
                        False,
                        "sest",
                    )
                elif (
                    word_dict["syllables"][0]["syllable"][0] in self.MUTES
                    and word_dict["syllables"][0]["syllable"][1] in self.LIQUIDS
                ):
                    # current word begins with mute + liquid
                    tokens[i - 1]["syllables"][-1]["long_by_position"] = (
                        False,
                        "mute+liquid",
                    )
                elif (
                    word_dict["syllables"][0]["syllable"][0]
                    in self.DOUBLE_CONSONANTS
                    or word_dict["syllables"][0]["syllable"][1] in self.CONSONANTS
                ):
                    # current word begins with two consonants
                    tokens[i - 1]["syllables"][-1]["long_by_position"] = (
                        True,
                        None,
                    )
            tokens.append(word_dict)
        return tokens
    def tokenize(self, text: str) -> List[Dict]:
        """
        Tokenize text into sentences (split on ".") and words.
        "Puella bona est. Puer malus est." ->
        [ [{word: puella, syllables: [...], index: 0}, ... ], ... ]

        :param text: string
        :return: list

        >>> Scansion().tokenize('puella bona est. puer malus est.')
        [{'plain_text_sentence': 'puella bona est', 'structured_sentence': [{'word': 'puella', 'index': 0, 'syllables': [{'syllable': 'pu', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}, {'syllable': 'el', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}, {'syllable': 'la', 'index': 2, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}], 'syllables_count': 3}, {'word': 'bona', 'index': 1, 'syllables': [{'syllable': 'bo', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'na', 'index': 1, 'elide': (True, 'weak'), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}], 'syllables_count': 2}, {'word': 'est', 'index': 2, 'syllables': [{'syllable': 'est', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}], 'syllables_count': 1}]}, {'plain_text_sentence': ' puer malus est', 'structured_sentence': [{'word': 'puer', 'index': 0, 'syllables': [{'syllable': 'pu', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'er', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': False}], 'syllables_count': 2}, {'word': 'malus', 'index': 1, 'syllables': [{'syllable': 'ma', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': True}, {'syllable': 'lus', 'index': 1, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (False, None), 'accented': False}], 'syllables_count': 2}, {'word': 'est', 'index': 2, 'syllables': [{'syllable': 'est', 'index': 0, 'elide': (False, None), 'long_by_nature': False, 'long_by_position': (True, None), 'accented': True}], 'syllables_count': 1}]}, {'plain_text_sentence': '', 'structured_sentence': []}]
        """
        tokenized_sentences = text.split(".")
        tokenized_text = []
        for sentence in tokenized_sentences:
            sentence_dict = {}
            sentence_dict["plain_text_sentence"] = sentence
            sentence_dict["structured_sentence"] = self._tokenize_words(sentence)
            tokenized_text.append(sentence_dict)
        return tokenized_text
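
    # Notation used by scan_text: "-" marks a long syllable, "u" a short one,
    # and "x" the final anceps syllable of each clausula.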
    def scan_text(self, text: str) -> List[str]:
        """
        Return a flat list of rhythms, one per sentence. Each rhythm is at
        most ``clausula_length`` characters long (set in the constructor), and
        elided syllables are excluded when ``elide`` is True.

        :param text: string
        :return: list of rhythm strings

        >>> Scansion().scan_text('dedērunt te miror antōnī quorum. sī quid est in mē ingenī jūdicēs quod sentiō.')
        ['u--uuu---ux', 'u---u--u---ux']
        """
        tokens = self.tokenize(text)
        clausulae = []
        for sentence in tokens:
            sentence_clausula = []
            syllables = [word["syllables"] for word in sentence["structured_sentence"]]
            flat_syllables = [syllable for word in syllables for syllable in word]
            if self.elide:
                # drop elided syllables, drop the final (anceps) syllable, and
                # walk backwards from the end of the sentence
                flat_syllables = [
                    syll for syll in flat_syllables if not syll["elide"][0]
                ][:-1][::-1]
            for syllable in flat_syllables:
                if len(sentence_clausula) < self.clausula_length - 1:
                    if syllable["long_by_nature"] or syllable["long_by_position"][0]:
                        sentence_clausula.append("-")
                    else:
                        sentence_clausula.append("u")
            sentence_clausula = sentence_clausula[::-1]
            sentence_clausula.append("x")
            clausulae.append("".join(sentence_clausula))
        # text.split(".") leaves a trailing empty sentence; drop its clausula
        clausulae = clausulae[:-1]
        return clausulae
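

# A minimal usage sketch, not part of the original module: the input text and
# the expected rhythms are taken from the scan_text doctest above.
if __name__ == "__main__":
    scanner = Scansion(clausula_length=13, elide=True)
    sample = (
        "dedērunt te miror antōnī quorum. "
        "sī quid est in mē ingenī jūdicēs quod sentiō."
    )
    # Expected output: ['u--uuu---ux', 'u---u--u---ux']
    print(scanner.scan_text(sample))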