Source code for cltk.lexicon.processes
"""
Processes for dictionary lookup.
"""
from copy import deepcopy
from dataclasses import dataclass
from boltons.cacheutils import cachedproperty
from cltk.core.data_types import Doc, Process
from cltk.core.exceptions import CLTKException
from cltk.lexicon.lat import LatinLewisLexicon
from cltk.lexicon.non import OldNorseZoegaLexicon
# Module author(s), each entry written as "Name <email>".
__author__ = ["Clément Besnier <clem@clementbesnier.fr>"]
@dataclass
class LexiconProcess(Process):
    """Base process that adds a dictionary definition to each word of a ``Doc``.

    To be inherited for each language's dictionary declarations.

    Example: ``LexiconProcess`` -> ``LatinLexiconProcess``

    >>> from cltk.lemmatize.processes import LemmatizationProcess
    >>> from cltk.core.data_types import Process
    >>> issubclass(LexiconProcess, Process)
    True
    """

    # Language code selecting the lexicon; "lat" and "non" are handled below.
    language: str = None

    @cachedproperty
    def algorithm(self):
        """Return the lexicon object matching ``self.language``.

        Supports the same languages as :meth:`run` so that a plain
        ``LexiconProcess`` can be used without a language-specific subclass.

        Raises:
            CLTKException: if no lexicon is available for ``self.language``.
        """
        if self.language == "lat":
            return LatinLewisLexicon()
        # ``run`` has a dedicated "non" branch, so the base class must be
        # able to build the Old Norse lexicon as well.
        if self.language == "non":
            return OldNorseZoegaLexicon()
        raise CLTKException(f"No lookup algorithm for language '{self.language}'.")

    def run(self, input_doc: Doc) -> Doc:
        """Look up every word of ``input_doc`` and store the result on the word.

        The input is deep-copied first; definitions are written to the copy's
        words (``word.definition``) and the copy is returned.

        Args:
            input_doc: document whose ``words`` are to be looked up.

        Returns:
            A copy of ``input_doc`` with ``definition`` set on each word.

        Raises:
            CLTKException: if ``self.language`` has no lookup rule here, or
                (from ``algorithm``) no lexicon.
        """
        lookup_algo = self.algorithm
        output_doc = deepcopy(input_doc)
        for word in output_doc.words:
            if self.language == "lat":
                # Latin entries are looked up by lemma.
                word.definition = lookup_algo.lookup(word.lemma)
            elif self.language == "non":
                # Old Norse entries are looked up by the surface string.
                word.definition = lookup_algo.lookup(word.string)
            else:
                raise CLTKException(
                    f"``LexiconProcess()`` not available for language '{self.language}' This should never happen."
                )
        return output_doc
class LatinLexiconProcess(LexiconProcess):
    """Dictionary lookup for Latin, using ``LatinLewisLexicon``.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import LatinTokenizationProcess
    >>> from cltk.lemmatize.processes import LatinLemmatizationProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Latin pipeline", \
    processes=[LatinTokenizationProcess, LatinLemmatizationProcess, LatinLexiconProcess], \
    language=get_lang("lat"))
    >>> nlp = NLP(language='lat', custom_pipeline=pipe, suppress_banner=True)
    >>> cltk_doc = nlp.analyze(text=get_example_text("lat"))
    >>> [word.definition[:10] for word in cltk_doc.words][:5]
    ['', 'est\\n\\n\\n see', 'omnis e (o', '', 'in old in']
    """

    # Class-level overrides of the inherited ``language``/``description``.
    language = "lat"
    description = "Dictionary lookup process for Latin"

    @cachedproperty
    def algorithm(self):
        """Return a new ``LatinLewisLexicon`` to serve the lookups."""
        latin_lexicon = LatinLewisLexicon()
        return latin_lexicon
class OldNorseLexiconProcess(LexiconProcess):
    """The default Old Norse dictionary lookup algorithm.

    Uses ``OldNorseZoegaLexicon`` as the lookup backend.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import OldNorseTokenizationProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old Norse pipeline", \
    processes=[OldNorseTokenizationProcess, OldNorseLexiconProcess], \
    language=get_lang("non"))
    >>> nlp = NLP(language='non', custom_pipeline=pipe, suppress_banner=True)
    >>> cltk_doc = nlp.analyze(text=get_example_text("non"))

    The example below is disabled until its expected output is verified.
    It is kept apart from the statement above by a blank line so that
    doctest does not read it as expected output. TODO check this.

    #>>> [word.definition[:10] for word in cltk_doc.words][:5]
    #['', '(-s, -ar),', '', 'adv.\\n1) th', '']
    """

    description = "Dictionary lookup process for Old Norse"
    language = "non"

    @cachedproperty
    def algorithm(self):
        """Return a new ``OldNorseZoegaLexicon`` to serve the lookups."""
        return OldNorseZoegaLexicon()