# Source code for cltk.phonology.syllabifier_processes

"""This module implements syllabification processes for several languages.
To write your own, extend **SyllabificationProcess**; the pre-defined
processes in this module serve as examples.
"""


from copy import deepcopy
from dataclasses import dataclass

from boltons.cacheutils import cachedproperty

from cltk.core.data_types import Doc, Process
from cltk.phonology.ang.phonology import OldEnglishSyllabifier
from cltk.phonology.enm.phonology import MiddleEnglishSyllabifier
from cltk.phonology.gmh.phonology import MiddleHighGermanSyllabifier
from cltk.phonology.lat.phonology import LatinSyllabifier
from cltk.phonology.non.phonology import OldNorseSyllabifier


@dataclass
class SyllabificationProcess(Process):
    """Base class for syllabification processes written in the CLTK style.

    Extend this class to code your own syllabification process. ``run``
    reads ``self.algorithm`` and calls it on every word, so an
    ``algorithm`` attribute must be available on the instance.
    """

    def run(self, input_doc: Doc) -> Doc:
        """Return a deep copy of ``input_doc`` with syllables set on each word.

        :param input_doc: the document whose ``words`` are to be syllabified;
            it is copied, not modified.
        :return: the copy, in which every word's ``syllables`` attribute holds
            the result of calling ``self.algorithm`` on the lowercased
            ``word.string``.
        """
        # Resolve the algorithm before copying, as the original code does.
        syllabify = self.algorithm
        annotated_doc = deepcopy(input_doc)
        for token in annotated_doc.words:
            lowered = token.string.lower()
            token.syllables = syllabify(lowered)
        return annotated_doc
class GreekSyllabificationProcess(SyllabificationProcess):
    """Syllabification ``Process`` for Ancient Greek.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import GreekTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk import NLP
    >>> a_pipeline = Pipeline(description="A custom Greek pipeline", processes=[GreekTokenizationProcess, DefaultPunctuationRemovalProcess, GreekSyllabificationProcess], language=get_lang("grc"))
    >>> nlp = NLP(language='grc', custom_pipeline=a_pipeline, suppress_banner=True)
    >>> text = get_example_text("grc")
    >>> cltk_doc = nlp(text)
    >>> [word.syllables for word in cltk_doc.words[:5]]
    [['ὅτι'], ['μὲν'], ['ὑμ', 'εῖς'], ['ὦ'], ['ἄν', 'δρ', 'ες']]
    """

    # Label of this process. It previously read "Latin", copied from
    # LatinSyllabificationProcess, which mislabelled the Greek process.
    description = "The default Greek Syllabification process"

    @cachedproperty
    def algorithm(self):
        """Return the syllabifier that ``run`` calls on each lowercased word."""
        # NOTE(review): this instantiates the Latin syllabifier for Greek
        # text. No Greek-specific syllabifier is imported in this module;
        # confirm whether one exists and should be used here instead.
        return LatinSyllabifier()
class LatinSyllabificationProcess(SyllabificationProcess):
    """Syllabification ``Process`` for Latin.

    The ``algorithm`` property supplies a ``LatinSyllabifier`` instance.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import LatinTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk import NLP
    >>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess, DefaultPunctuationRemovalProcess, LatinSyllabificationProcess], language=get_lang("lat"))
    >>> nlp = NLP(language='lat', custom_pipeline=a_pipeline, suppress_banner=True)
    >>> text = get_example_text("lat")
    >>> cltk_doc = nlp(text)
    >>> [word.syllables for word in cltk_doc.words[:5]]
    [['gal', 'li', 'a'], ['est'], ['om', 'nis'], ['di', 'vi', 'sa'], ['in']]
    """

    description = "The default Latin Syllabification process"

    @cachedproperty
    def algorithm(self):
        """Return the Latin syllabifier that ``run`` calls on each word."""
        latin_syllabifier = LatinSyllabifier()
        return latin_syllabifier
class MiddleEnglishSyllabificationProcess(SyllabificationProcess):
    """Syllabification ``Process`` for Middle English.

    The ``algorithm`` property supplies a ``MiddleEnglishSyllabifier``
    instance.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import MiddleEnglishTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Middle English pipeline", \
    processes=[MiddleEnglishTokenizationProcess, DefaultPunctuationRemovalProcess, MiddleEnglishSyllabificationProcess], \
    language=get_lang("enm"))
    >>> nlp = NLP(language='enm', custom_pipeline=pipe, suppress_banner=True)
    >>> text = get_example_text("enm").replace('\\n', ' ')
    >>> cltk_doc = nlp(text)
    >>> [word.syllables for word in cltk_doc.words[:5]]
    [['whi', 'lom'], ['as'], ['ol', 'de'], ['sto', 'ries'], ['tellen']]
    """

    description = "The default Middle English Syllabification process"

    @cachedproperty
    def algorithm(self):
        """Return the Middle English syllabifier that ``run`` calls on each word."""
        middle_english_syllabifier = MiddleEnglishSyllabifier()
        return middle_english_syllabifier
class MiddleHighGermanSyllabificationProcess(SyllabificationProcess):
    """Syllabification ``Process`` for Middle High German.

    The ``algorithm`` property supplies a ``MiddleHighGermanSyllabifier``
    instance.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import MiddleHighGermanTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Middle High German pipeline", \
    processes=[MiddleHighGermanTokenizationProcess, DefaultPunctuationRemovalProcess, \
    MiddleHighGermanSyllabificationProcess], language=get_lang("gmh"))
    >>> nlp = NLP(language='gmh', custom_pipeline=pipe, suppress_banner=True)
    >>> text = get_example_text("gmh")
    >>> cltk_doc = nlp(text)
    >>> [word.syllables for word in cltk_doc.words[:5]]
    [['uns'], ['ist'], ['in'], ['al', 'ten'], ['mæ', 'ren']]
    """

    description = "The default Middle High German syllabification process"

    @cachedproperty
    def algorithm(self):
        """Return the Middle High German syllabifier that ``run`` calls on each word."""
        middle_high_german_syllabifier = MiddleHighGermanSyllabifier()
        return middle_high_german_syllabifier
class OldEnglishSyllabificationProcess(SyllabificationProcess):
    """Syllabification ``Process`` for Old English.

    The ``algorithm`` property supplies an ``OldEnglishSyllabifier``
    instance. Note that the example pipeline below tokenizes with
    ``MiddleEnglishTokenizationProcess``.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import MiddleEnglishTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old English pipeline", \
    processes=[MiddleEnglishTokenizationProcess, DefaultPunctuationRemovalProcess, OldEnglishSyllabificationProcess], \
    language=get_lang("ang"))
    >>> nlp = NLP(language='ang', custom_pipeline=pipe, suppress_banner=True)
    >>> text = get_example_text("ang")
    >>> cltk_doc = nlp(text)
    >>> [word.syllables for word in cltk_doc.words[:5]]
    [['hwæt'], ['we'], ['gar', 'den', 'a'], ['in'], ['gear', 'da', 'gum']]
    """

    description = "The default Old English syllabification process"

    @cachedproperty
    def algorithm(self):
        """Return the Old English syllabifier that ``run`` calls on each word."""
        old_english_syllabifier = OldEnglishSyllabifier()
        return old_english_syllabifier
class OldNorseSyllabificationProcess(SyllabificationProcess):
    """Syllabification ``Process`` for Old Norse.

    The ``algorithm`` property supplies an ``OldNorseSyllabifier`` instance.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import OldNorseTokenizationProcess
    >>> from cltk.text.processes import OldNorsePunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old Norse pipeline", \
    processes=[OldNorseTokenizationProcess, OldNorsePunctuationRemovalProcess, OldNorseSyllabificationProcess], \
    language=get_lang("non"))
    >>> nlp = NLP(language='non', custom_pipeline=pipe, suppress_banner=True)
    >>> text = get_example_text("non")
    >>> cltk_doc = nlp(text)
    >>> [word.syllables for word in cltk_doc.words[:5]]
    [['gyl', 'fi'], ['ko', 'nungr'], ['réð'], ['þar'], ['lön', 'dum']]
    """

    description = "The default Old Norse syllabification process"

    @cachedproperty
    def algorithm(self):
        """Return the Old Norse syllabifier that ``run`` calls on each word."""
        old_norse_syllabifier = OldNorseSyllabifier()
        return old_norse_syllabifier