Source code for cltk.tag.treebanks
"""Generate a nested Python dict from a treebank supplied as a string. As of this version, only treebanks following the Penn notation are supported.
"""
def set_path(dicts, keys, v):
    """Append a value at the end of a key path inside a nested dictionary.

    Intermediate dictionaries along the path are created on demand, and the
    final key always maps to a list, so repeated insertions at the same path
    accumulate instead of overwriting each other. The dictionary is modified
    in place and nothing is returned.

    :param dicts: dict: the dictionary to modify
    :param keys: list str: path of keys leading to the value
    :param v: str: value to append at the end of the path

    >>> d = dict()
    >>> set_path(d, ['a', 'b', 'c'], 'd')
    >>> d
    {'a': {'b': {'c': ['d']}}}

    A second value stored under the same path is appended to the leaf list:

    >>> set_path(d, ['a', 'b', 'c'], 'e')
    >>> d
    {'a': {'b': {'c': ['d', 'e']}}}
    """
    node = dicts
    # Descend through every key but the last, creating levels as needed.
    for key in keys[:-1]:
        node = node.setdefault(key, {})
    # The last key holds the list of values collected for this path.
    node.setdefault(keys[-1], []).append(v)
def get_paths(src):
    """Generate root-to-leaf paths from a treebank in string format.

    ``get_paths`` is a generator: paths are produced one at a time, in the
    order their leaves appear in the input, rather than all at once.

    Tokens are separated by parentheses and by any whitespace character
    (spaces, tabs, newlines), so multi-line or tab-indented treebanks are
    handled the same way as single-line ones.

    :param src: str: treebank
    :return: generator of lists of str, each list holding the labels from
        the root down to (and including) one leaf token

    >>> list(get_paths("(NP-SBJ (PRO he))"))
    [['NP-SBJ', 'PRO', 'he']]
    """
    st = []   # labels on the path from the root to the current position
    tmp = ""  # token currently being accumulated
    for let in src:
        if let == "(" or let.isspace():
            # A separator ends the pending token, which becomes a new
            # path component.
            if tmp:
                st.append(tmp)
                tmp = ""
        elif let == ")":
            if tmp:
                # A token directly before ")" is a leaf: emit the full path,
                # then drop both the leaf and the constituent being closed.
                st.append(tmp)
                yield st
                # Slicing rebinds ``st`` to a new list, so the list that was
                # just yielded is never modified afterwards.
                st = st[:-2]
            else:
                # Closing a constituent whose children were already handled.
                st = st[:-1]
            tmp = ""
        else:
            tmp += let
def parse_treebanks(st):
    """Build a nested dictionary representing the tree of a treebank.

    Every root-to-leaf path produced by ``get_paths`` is inserted with
    ``set_path``: the labels above the leaf become nested dictionary keys and
    the leaf token is appended to the list stored under the innermost label.

    :param st: str: treebank using Penn notation
    :return: dict: the tree as a nested dictionary
    """
    tree = {}
    for *labels, leaf in get_paths(st):
        set_path(tree, labels, leaf)
    return tree