File size: 7,356 Bytes
c45d283 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
"""Encapsulates all word and punctuation symbols layer.
Layer 0 is the basic layer for all the UCCA annotation, as it includes the
actual words and punctuation marks found in the :class:`core`.Passage.
Layer 0 has only one type of node, :class:`Terminal`. This is a subtype of
:class:`core`.Node, and can have one of two tags: Word or Punctuation.
"""
from ucca import core
# Identifier of this layer; it is also the prefix of every Terminal ID
# (see Layer0.add_terminal and Terminal.position).
LAYER_ID = '0'
class NodeTags:
    """Namespace holding the tags a layer 0 node may carry.

    The class only groups string constants. Its ``__init__`` is set to None,
    so attempting to instantiate it fails.
    """

    # Tag of punctuation-mark Terminals.
    Punct = "Punctuation"
    # Tag of word Terminals.
    Word = "Word"

    # Constants holder only: calling NodeTags() is an error.
    __init__ = None
# Keys of the attribute dictionary that Layer0.add_terminal gives to every
# Terminal it creates.
ATTRIB_KEYS = ('text', 'paragraph', 'paragraph_position')
class Terminal(core.Node):
    """Layer 0 Node type, represents a word or a punctuation mark.

    Terminals are :class:`core`.Node objects which represent a word or
    a punctuation mark in the :class:`core`.Passage object. They are immutable,
    as they shouldn't be changed throughout their use and have no children.
    Hence, they can be compared and hashed, unlike other core.Node subclasses.

    Attributes:
        ID: the unique ID of each Terminal is its global position in the
            Passage, e.g. ID=0.4 is the 4th Terminal in the :class:`Passage`.
        tag: from NodeTags
        layer: '0' (LAYER_ID)
        attrib: returns a copy of the attribute dictionary, so changing it
            will not affect the Terminal object
        text: text of the Terminal, whether punctuation or a word
        position: global position of the Terminal in the passage, starting at 1
        paragraph: which paragraph the Terminal belongs to, starting at 1
        para_pos: the position of the Terminal in the paragraph,
            starting at 1 (per paragraph).
        tok: the entry for this Terminal in the layer's ``extra["doc"]``
            structure, or None if there is no such entry
        punct: whether the Terminal is a punctuation mark (boolean)

    """

    @property
    def text(self):
        """The Terminal's text, read from the ``'text'`` attribute."""
        return self.attrib['text']

    @property
    def position(self):
        """The global position of the Terminal, parsed from its ID."""
        # the format of ID is LAYER_ID + ID separator + position
        return int(self.ID[len(LAYER_ID) + len(core.Node.ID_SEPARATOR):])

    @property
    def para_pos(self):
        """The ``'paragraph_position'`` attribute of the Terminal."""
        return self.attrib['paragraph_position']

    @property
    def paragraph(self):
        """The ``'paragraph'`` attribute of the Terminal."""
        return self.attrib['paragraph']

    @property
    def tok(self):
        """The entry of this Terminal in the layer's ``extra["doc"]``.

        The structure is indexed first by paragraph and then by position
        within the paragraph; both are converted from 1-based to 0-based.

        :return: the stored entry, or None if ``"doc"`` is missing or has no
                 entry at this paragraph/position

        """
        try:
            return self.layer.extra["doc"][self.paragraph - 1][self.para_pos - 1]
        except (KeyError, IndexError):
            return None

    def get_annotation(self, attr, as_array=False):
        """Returns the annotation of this Terminal for the given attribute.

        :param attr: object describing the annotation; it must expose ``key``
                     and ``value`` and be callable
                     (presumably an enum-like descriptor -- TODO confirm)
        :param as_array: if true, look up ``attr.value`` in :attr:`tok` and
                         pass the result to ``attr``; otherwise read
                         ``attr.key`` from the ``extra`` dictionary
        :return: the annotation value; None if ``as_array`` is false and the
                 key is absent from ``extra``

        """
        # NOTE(review): with as_array=True and no stored entry, self.tok is
        # None and the subscript below raises TypeError.
        return attr(self.tok[attr.value]) if as_array else self.extra.get(attr.key)

    @property
    def attrib(self):
        """A copy of the attribute dictionary, so callers cannot mutate it."""
        return self._attrib.copy()

    @property
    def punct(self):
        """Whether the Terminal is tagged as punctuation."""
        return self.tag == NodeTags.Punct

    def get_terminals(self, punct=True, *args, **kwargs):
        """Returns a list containing just this Terminal.

        :param punct: whether to include punctuation Terminals, defaults to True

        :return: a list of :class:`layer0`.Terminal objects

        """
        # Extra arguments are accepted and ignored.
        del args, kwargs
        return [] if self.punct and not punct else [self]

    def equals(self, other, *, ordered=False, **kwargs):
        """Equals if the Terminals are of the same Layer, tag, position & text.

        Unlike ``==``, the passages (roots) of the two Terminals are not
        compared.

        :param other: another Terminal to equal to
        :param ordered: unused, here for API conformity.

        :return: True iff the two Terminals are equal.

        """
        return (self.layer.ID == other.layer.ID and self.text == other.text
                and self.position == other.position and self.tag == other.tag
                and self.paragraph == other.paragraph
                and self.para_pos == other.para_pos)

    def __eq__(self, other):
        """Equals if both of the same Passage, Layer, position, tag & text.

        Objects that have no ``layer.ID`` (e.g. None or a string) are not
        comparable to a Terminal: NotImplemented is returned for them, so
        ``terminal == None`` evaluates to False instead of raising.

        """
        try:
            other_layer_id = other.layer.ID
        except AttributeError:
            # Not a node-like object; let Python fall back to its default
            # comparison rather than fail inside the equality test.
            return NotImplemented
        if other_layer_id != LAYER_ID:
            return False
        return (self.root == other.root and self.layer.ID == other_layer_id
                and self.position == other.position
                and self.text == other.text and self.tag == other.tag
                and self.paragraph == other.paragraph
                and self.para_pos == other.para_pos)

    def __hash__(self):
        """Hashes the Terminals according to its ID and text."""
        return hash(self.ID + str(self.text))

    def __str__(self):
        """Returns the text of the Terminal."""
        return self.text

    # Terminal are immutable (except the extra dictionary which is
    # just a temporary playground) and have no children, so enforce it
    def add(self, *args, **kwargs):
        """Not supported for Terminals; always raises NotImplementedError."""
        raise NotImplementedError()

    def remove(self, *args, **kwargs):
        """Not supported for Terminals; always raises NotImplementedError."""
        raise NotImplementedError()
class Layer0(core.Layer):
    """Represents the :class:`Terminal` objects layer.

    Attributes:
        words: a tuple of only the words (not punctuation) Terminals, ordered
        pairs: a tuple of (position, terminal) tuples of all Terminals, ordered

    """

    def __init__(self, root, attrib=None):
        """Creates the layer with ID LAYER_ID under the given root.

        :param root: the Passage this layer belongs to
        :param attrib: optional attribute dictionary, passed to the base class

        """
        super().__init__(ID=LAYER_ID, root=root, attrib=attrib)

    @property
    def words(self):
        """A tuple of the non-punctuation Terminals of the layer."""
        return tuple(x for x in self._all if not x.punct)

    @property
    def pairs(self):
        """A tuple of (position, Terminal) pairs, positions starting at 1."""
        return tuple(enumerate(self._all, start=1))

    def by_position(self, pos):
        """Returns the Terminals at the position given.

        :param pos: the position of the Terminal object, starting at 1

        :return: the Terminal in this position

        :raise IndexError: if the position is out of bounds

        """
        # Positions start at 1. Without this check, 0 and negative positions
        # would wrap around through negative indexing and silently return a
        # Terminal from the end of the layer.
        if pos < 1:
            raise IndexError(
                "Terminal position must be at least 1, got {}".format(pos))
        return self._all[pos - 1]  # positions start at 1, not 0

    def add_terminal(self, text, punct, paragraph=1):
        """Adds the next Terminal at the next available position.

        Creates a :class:`Terminal` object with the next position, assuming that
        all positions are filled (no holes).

        :param text: the text of the Terminal
        :param punct: boolean, whether it's a punctuation mark
        :param paragraph: paragraph number, defaults to 1

        :return: the created Terminal

        :raise DuplicateIdError: if trying to add an already existing Terminal,
            caused by un-ordered Terminal positions in the layer

        """
        position = len(self._all) + 1  # we want positions to start with 1
        # Continue the numbering of the last Terminal when it is in the same
        # paragraph; otherwise this Terminal starts a paragraph at position 1.
        if position > 1 and paragraph == self._all[-1].paragraph:
            para_pos = self._all[-1].para_pos + 1
        else:
            para_pos = 1
        tag = NodeTags.Punct if punct else NodeTags.Word
        return Terminal(ID="{}{}{}".format(LAYER_ID, core.Node.ID_SEPARATOR, position),
                        root=self.root, tag=tag,
                        attrib={'text': text,
                                'paragraph': paragraph,
                                'paragraph_position': para_pos})

    def copy(self, other_passage):
        """Creates a copied Layer0 object and Terminals in other_passage.

        The layer's attributes and ``extra``, and each Terminal's ``extra``,
        are duplicated with their ``copy()`` method. Nothing is returned.

        :param other_passage: the Passage to copy self to

        """
        other = Layer0(root=other_passage, attrib=self.attrib.copy())
        other.extra = self.extra.copy()
        for t in self._all:
            copied = other.add_terminal(t.text, t.punct, t.paragraph)
            copied.extra = t.extra.copy()

    def docs(self, num_paragraphs=1):
        """Returns the per-paragraph list stored under ``extra["doc"]``.

        The list is created as ``[[]]`` if absent, and empty lists are
        appended until it holds at least ``num_paragraphs`` entries.

        :param num_paragraphs: minimal number of entries required

        :return: the list stored in ``extra["doc"]`` (not a copy)

        """
        docs = self.extra.setdefault("doc", [[]])
        while len(docs) < num_paragraphs:
            docs.append([])
        return docs

    def doc(self, paragraph):
        """Returns the ``extra["doc"]`` entry of the given paragraph.

        :param paragraph: paragraph number, starting at 1; the list is grown
                          as needed to contain it

        :return: the entry for this paragraph

        """
        return self.docs(paragraph)[paragraph - 1]
def is_punct(node):
    """Tells whether a node is a layer 0 punctuation Terminal.

    Works for any unit: nodes of other layers yield False without their
    ``punct`` attribute being accessed.

    :param node: the node to check; must expose ``layer.ID``

    :return: False if the node is not in layer 0, else its ``punct`` value

    """
    in_layer0 = node.layer.ID == LAYER_ID
    if not in_layer0:
        return in_layer0
    return node.punct
|