# Source code for cltk.dependency.tree
"""A data structure for representing dependency tree graphs."""
__author__ = ["John Stewart <free-variation>"]
from typing import Dict, List, Union
from xml.etree.ElementTree import Element, ElementTree
from cltk.core.data_types import Doc, Process, Word
from cltk.core.exceptions import CLTKException
from cltk.morphology.universal_dependencies_features import (
NOMINAL_FEATURES,
OTHER_FEATURES,
VERBAL_FEATURES,
MorphosyntacticFeature,
)
# Concatenation, in this order, of the nominal, verbal and other feature
# collections imported from ``universal_dependencies_features`` above.
# NOTE(review): presumably sequences of feature classes -- confirm there.
ALL_POSSIBLE_FEATURES = NOMINAL_FEATURES + VERBAL_FEATURES + OTHER_FEATURES
class Form(Element):
    """A word (i.e., node) of a dependency tree together with its attributes.

    Inherits from the ``Element`` class of Python's ``xml.etree`` library:
    the word's text is stored as the element tag, its features (including
    the ``form_id``, ``pos`` and ``relation``) as element attributes, and
    its dependents as child elements.

    >>> desc_form = Form('described')
    >>> desc_form
    described_0
    >>> desc_form.set('Tense', 'Past')
    >>> desc_form
    described_0
    >>> desc_form / 'VBN'
    described_0/VBN
    >>> desc_form.full_str()
    'described_0 [Tense=Past,pos=VBN]'
    """

    def __init__(self, form: str, form_id: int = 0) -> None:
        """Create a form for the text ``form``.

        :param form: The word's text; becomes the element tag.
        :param form_id: Identifier of the word; stored, converted to a
            string, under the ``form_id`` attribute.
        """
        super().__init__(form, attrib={"form_id": str(form_id)})

    def __truediv__(self, pos_tag: str) -> "Form":
        """Assign the POS feature of this form and return the form itself.

        This is done by overloading ``operator.truediv()`` (``a / b``) to
        perform ``.set()`` upon an ``Element`` of the xml library.

        >>> desc_form = Form('described')
        >>> desc_form / 'VBN'
        described_0/VBN
        >>> import operator
        >>> desc_form = Form('described')
        >>> operator.truediv(desc_form, 'VBN')
        described_0/VBN
        """
        self.set("pos", pos_tag)
        return self

    def __rshift__(self, other: Union["Form", str]) -> "Dependency":
        """Create a dependency from this form (governor) to ``other`` (dependent).

        Adds the dependent to the children of this form. This is done by
        overloading ``operator.rshift()`` (``a >> b``) to perform
        ``.append()`` upon an ``Element`` of the xml library. A plain string
        is first wrapped in a new ``Form`` (with the default ID 0).

        :param other: The dependent, as a ``Form`` or as its text.
        :return: An untyped ``Dependency`` from this form to the dependent;
            apply ``| 'relation'`` to the result to type it.

        >>> john = Form('John', 1) / 'NNP'
        >>> john
        John_1/NNP
        >>> loves = Form('loves', 2) / 'VRB'
        >>> loves
        loves_2/VRB
        >>> mary = Form('Mary', 3) / 'NNP'
        >>> mary
        Mary_3/NNP
        >>> loves >> john
        (loves_2/VRB, John_1/NNP)
        >>> list(loves)
        [John_1/NNP]
        """
        dependent = Form(other) if isinstance(other, str) else other
        self.append(dependent)
        return Dependency(self, dependent)

    def get_dependencies(self, relation: str) -> List["Dependency"]:
        """Extract dependents of this form for the specified
        dependency relation.

        Only direct children are considered. Children are compared on their
        ``relation`` attribute directly rather than through an XPath
        expression, so the relation name may contain any character
        (including quotes).

        :param relation: Name of the dependency relation to select.
        :return: One ``Dependency`` per matching child, in child order.

        >>> john = Form('John', 1) / 'NNP'
        >>> loves = Form('loves', 2) / 'VRB'
        >>> mary = Form('Mary', 3) / 'NNP'
        >>> loves >> john | 'subj'
        subj(loves_2/VRB, John_1/NNP)
        >>> loves >> mary | 'obj'
        obj(loves_2/VRB, Mary_3/NNP)
        >>> loves.get_dependencies('subj')
        [subj(loves_2/VRB, John_1/NNP)]
        >>> loves.get_dependencies('obj')
        [obj(loves_2/VRB, Mary_3/NNP)]
        """
        return [
            Dependency(self, child, relation)
            for child in self
            if child.get("relation") == relation
        ]

    def __str__(self) -> str:
        """Return ``<text>_<form_id>``, followed by ``/<pos>`` if a POS is set."""
        pos = self("pos")
        suffix = ("/" + pos) if pos else ""
        return self.tag + "_" + self("form_id") + suffix

    __repr__ = __str__

    def full_str(self, include_relation=True) -> str:
        """Returns a string containing all features of the Form.

        The ID is attached to the text, and the relation is
        optionally suppressed. Features appear in the order in which
        they were set.

        :param include_relation: Whether the ``relation`` attribute is
            listed among the features.

        >>> loves = Form('loves', 2) / 'VRB'
        >>> loves.full_str()
        'loves_2 [pos=VRB]'
        >>> john = Form('John', 1) / 'NNP'
        >>> loves >> john | 'subj'
        subj(loves_2/VRB, John_1/NNP)
        >>> john.full_str(True)
        'John_1 [pos=NNP,relation=subj]'
        >>> john.full_str(False)
        'John_1 [pos=NNP]'
        """
        excluded = {"form_id"} if include_relation else {"form_id", "relation"}
        # Format rather than concatenate: attribute values are normally
        # strings, but e.g. a missing lemma is stored as ``None`` by to_form.
        features = ",".join(
            "{}={}".format(name, value)
            for name, value in self.attrib.items()
            if name not in excluded
        )
        return "{0}_{1} [{2}]".format(self.tag, self("form_id"), features)

    def __call__(self, feature: str) -> Union[str, None]:
        """Return the value of ``feature``, or ``None`` if it is not set."""
        return self.get(feature)

    @staticmethod
    def to_form(word: "Word") -> "Form":
        """Converts a ``CLTK`` ``Word`` object to a ``Form``.

        The word's ``string`` and ``index_token`` become the form's text and
        ID; ``lemma``, ``pos`` (stringified), ``upos`` and ``xpos`` are copied
        as attributes. For every entry of ``word.features.all()`` the first
        value is stored under the feature's name; features without any value
        are skipped.

        TODO: The Form info that prints is incomplete/ugly; correct str repr of ``Form``

        TODO: Fix these doctests; it's ugly to import so many Forms, but is this required?

        >>> from cltk.morphology.universal_dependencies_features import Case, Gender, Number, POS
        >>> noun = POS.noun
        >>> nominative = Case.nominative
        >>> feminine = Gender.feminine
        >>> singular = Number.singular
        >>> cltk_word = Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Gallia', pos=noun, lemma='Gallia', stem=None, scansion=None, xpos='A1|grn1|casA|gen2', upos='NOUN', dependency_relation='nsubj', governor=1, features={Case: [nominative], Gender: [feminine], Number: [singular]}, category={F: [neg], N: [pos], V: [neg]}, stop=False, named_entity='LOCATION', syllables=None, phonetic_transcription=None, definition='') # doctest: +SKIP
        >>> cltk_word.features[Case] = Case.nominative # doctest: +SKIP
        >>> cltk_word.features[Gender] = Gender.feminine # doctest: +SKIP
        >>> cltk_word.features[Number] = Number.singular # doctest: +SKIP
        >>> f = Form.to_form(cltk_word) # doctest: +SKIP
        >>> f.full_str() # doctest: +SKIP
        'Gallia_0 [lemma=Gallia,pos=NOUN,upos=NOUN,xpos=A1|grn1|casA|gen2,Case=nominative,Gender=feminine,Number=singular]'
        """
        form = Form(word.string, form_id=word.index_token)
        form.set("lemma", word.lemma)
        form.set("pos", str(word.pos))
        form.set("upos", word.upos)
        form.set("xpos", word.xpos)
        for feature_name, feature_values in word.features.all():
            # A feature with no values (``None`` or empty) has nothing to
            # record; indexing it would raise.
            if not feature_values:
                continue
            form.set(str(feature_name), str(feature_values[0]))
        return form
class Dependency:
    """An asymmetric binary relationship (edge) between two forms.

    The governing ``Form`` is the "head"; the subordinate ``Form`` is the
    "dependent". In principle the edge can capture any form-to-form
    relation the system deems of interest, be it syntactic, semantic, or
    discursive.

    When ``relation`` is not specified the dependency is *untyped*: it only
    states that some asymmetric relationship holds between head and
    dependent. A *typed* dependency carries a string in ``relation``; it
    can be assigned after construction with ``dependency | 'name'``.
    """

    def __init__(self, head: Form, dep: Form, relation: str = None) -> None:
        """Store the head, the dependent and the (optional) relation name."""
        self.head, self.dep, self.relation = head, dep, relation

    def __str__(self) -> str:
        """Return ``relation(head, dep)``; the relation is empty if untyped."""
        label = self.relation or ""
        return "{rel}({gov}, {sub})".format(rel=label, gov=self.head, sub=self.dep)

    __repr__ = __str__

    def __or__(self, relation: str) -> "Dependency":
        """Type this dependency with ``relation`` and return it.

        The name is recorded both on this object and as the ``relation``
        attribute of the dependent form.
        """
        self.relation = relation
        dependent = self.dep
        dependent.set("relation", relation)
        return self
class DependencyTree(ElementTree):
    """The hierarchical tree representing the entirety of a parse."""

    def __init__(self, root: Form) -> None:
        """Build a tree rooted at ``root``.

        The root form is marked with the relation ``"root"``.
        """
        root.set("relation", "root")
        ElementTree.__init__(self, root)

    def get_dependencies(self) -> List[Dependency]:
        """Returns a list of all the dependency relations in the tree,
        generated by depth-first search.

        Every non-root node contributes exactly one dependency (from its
        parent to itself, carrying the node's ``relation`` attribute, which
        is ``None`` for an untyped edge), listed after the dependencies of
        its own subtree. The final entry is the root dependency, whose head
        is ``None``.

        >>> eats = Form('eats', 2) / 'VRB'
        >>> eats >> (Form('John', 1) / 'NNP') | 'subj'
        subj(eats_2/VRB, John_1/NNP)
        >>> eats >> (Form('apples', 3) / 'NNS') | 'obj'
        obj(eats_2/VRB, apples_3/NNS)
        >>> eats >> (Form('pears', 4) / 'NNS') | 'obj'
        obj(eats_2/VRB, pears_4/NNS)
        >>> t = DependencyTree(eats)
        >>> t.get_dependencies()
        [subj(eats_2/VRB, John_1/NNP), obj(eats_2/VRB, apples_3/NNS), obj(eats_2/VRB, pears_4/NNS), root(None, eats_2/VRB)]
        """

        def _collect(node: Form, deps: List[Dependency]) -> List[Dependency]:
            for child_node in node:
                _collect(child_node, deps)
                # One edge per child. Looking children up by relation name
                # here would repeat siblings that share a relation.
                deps.append(Dependency(node, child_node, child_node("relation")))
            return deps

        root = self.getroot()
        deps = _collect(root, [])
        deps.append(Dependency(None, root, "root"))
        return deps

    def print_tree(self, all_features: bool = False):
        """Prints a pretty-printed (indented) representation
        of the dependency tree. If all_features is True, then
        each node is printed with its complete feature bundles.

        Each line shows the node's relation, a ``|`` separator and the
        node; children are indented four spaces deeper than their parent.
        """

        def _print_treelet(node: Form, indent: int, all_features: bool):
            edge = "└─ " if indent > 0 else ""
            # The relation is printed as the line's prefix, so it is left
            # out of the feature bundle.
            node_str = node.full_str(False) if all_features else str(node)
            # An untyped node has no relation attribute; print it as empty.
            relation = node("relation") or ""
            print(" " * indent + edge + relation + " | " + node_str)
            for child_node in node:
                _print_treelet(child_node, indent + 4, all_features)

        _print_treelet(self.getroot(), indent=0, all_features=all_features)

    @staticmethod
    def to_tree(sentence: List[Word]) -> Union["DependencyTree", None]:
        """Factory method to create trees from sentences parses, i.e. lists of words.

        Each word is converted with ``Form.to_form``. The word whose
        ``dependency_relation`` is ``"root"`` becomes the root of the tree;
        every other word is attached to the form of its ``governor``, unless
        the governor is ``-1``.

        :param sentence: The words of one parsed sentence.
        :return: The tree, or ``None`` if no word is marked as root.

        >>> from cltk.languages.example_texts import get_example_text
        >>> from cltk.dependency.processes import StanzaProcess
        >>> process_stanza = StanzaProcess(language="lat")
        >>> output_doc = process_stanza.run(Doc(raw=get_example_text("lat")))
        >>> a_sentence = output_doc.sentences[0]
        >>> t = DependencyTree.to_tree(a_sentence)
        >>> t.findall(".")
        [divisa_3/adjective]
        """
        forms = {
            word.index_token: Form.to_form(word) for word in sentence
        }  # type: Dict[int, Form]
        root = None
        for word in sentence:
            if word.dependency_relation == "root":
                root = forms[word.index_token]
            elif word.governor != -1:
                # only add a non-root element to the tree if it has a governor (i.e. not -1)
                gov = forms[word.governor]
                dep = forms[word.index_token]
                # ``>>`` attaches dep under gov; ``|`` then types the edge.
                (gov >> dep) | word.dependency_relation
        # Compare against None explicitly: an Element without children is
        # falsy, so a truth test would discard a one-word sentence's root.
        return DependencyTree(root) if root is not None else None