Spaces: Sleeping
new version
Browse files
- demo.py +75 -66
- pipeline/__pycache__/post_processors.cpython-310.pyc +0 -0
- pipeline/post_processors.py +557 -257
- requirements.txt +1 -1
- utils/__pycache__/visualize.cpython-310.pyc +0 -0
- utils/visualize.py +25 -7
demo.py
CHANGED
@@ -16,32 +16,34 @@ from utils.visualize import visualize_spans
 # "packages/en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2"
 # )
 
-#Load from local storage
-#MODEL_LIST = ['en_engagement_RoBERTa-ME-AtoE.tar.gz']
+# Load from local storage
+# MODEL_LIST = ['en_engagement_RoBERTa-ME-AtoE.tar.gz']
 
-#model = st.selectbox('Select model', MODEL_LIST, index=0)
-#nlp = spacy.load("packages/" + model)
+# model = st.selectbox('Select model', MODEL_LIST, index=0)
+# nlp = spacy.load("packages/" + model)
 
 # Load from huggingface
 # sm = spacy.load('en_core_web_sm', disable=['ner'])
 
-st.set_page_config(
-
-
+st.set_page_config(
+    page_title="ENGAGEMENT analyzer (beta ver 0.3)",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
 
 
-@st.
+@st.cache_resource()
 def load_model():
     # nlp = spacy.load("en_engagement_RoBERTa_context_flz")
     nlp = spacy.load("en_engagement_LSTM")
     # nlp = spacy.load("en_engagement_spl_RoBERTa_base_attention")
-    return
+    return nlp
 
 
 nlp = load_model()
 
 doc = nlp(
-
+    "Welcome! Probably this is one of the few attempts to teach a machine how to read the discourse...! Although it is not perfect, you should be able to get a good place to start for your stance-taking analyses. The result will be presented here."
 )
 
 # TPL_ENT = """
@@ -93,44 +95,44 @@ TPL_SPAN_START = """
 DEFAULT_TEXT = """Tickner said regardless of the result, the royal commission was a waste of money and he would proceed with a separate inquiry into the issue headed by Justice Jane Matthews. His attack came as the Aboriginal women involved in the case demanded a female minister examine the religious beliefs they claim are inherent in their fight against a bridge to the island near Goolwa in South Australia."""
 
 TEXT_LIST = [
-
-
-
-
-
-
-
-
-
-
+    """To a significant extent, individuals can be considered responsible for the rise of Hitler to power on the 31st of January, 1933. Hitler himself, the charismatic leader of the Nazi Party, as well as creator of Nazi policy, played a key role in his own rise to power. However, other individuals in government, such as Hindenburg and von Papen were influential in Hitler’s rise. To a small extent, other factors also enabled Hitler to rise to power such as the Depression and the weakness of the political system. Nevertheless to a significant extent, individuals can be held responsible for the rise of Adolf Hitler to power.""",
+    """Tickner said regardless of the result, the royal commission was a waste of money and he would proceed with a separate inquiry into the issue headed by Justice Jane Matthews. His attack came as the Aboriginal women involved in the case demanded a female minister examine the religious beliefs they claim are inherent in their fight against a bridge to the island near Goolwa in South Australia.""",
+    """Certainly, the argumentation is not without some faults. For example, the statement that “linking homosexuality to witches fulfills the same purpose” is not supported by references to the readings. It is not clear who was linking homosexuality to witches and in what context. Nevertheless, overall and in line with the general tendencies reported in the previous section, the author employs various contracting and expanding engagement resources successfully. However, a large part of the successful use of engagement resources seems to be related to how the author structures these strategies throughout the text, namely in a wave-like fashion: from acknowledging the opinions of others, to countering them by offering one’s own interpretation, to supporting it by acknowledging other sources.""",
+    """As the centuries passed, accounts of witchcraft became more and more specific; details of witches’ ceremonies and oaths became more concrete and whatever the condemned humans confessed to was treated as fact. As discussants correctly pointed out, Bernardino of Siena, Martin Le Franc, and the anonymous author of the Errores Gazariorum all have an even more aggressive campaign against witches than did the authors of our previous readings. By depicting their rituals and customs, they look to paint the most grotesque picture of witches possible. Their frenzied accusations, were some of the main catalysts of the subsequent witch hunts.""",
+    """The post labeled “Witchcraft as a Problem in Society” clearly explains the contribution that each text makes to the witch hunts. While two of the authors focused on describing, in full detail, the shocking and disturbing practices that witches partook of, the others tried to prove that the witch threat was real. These last texts sought to explain witchcraft so as to convince readers that witches actually existed. As all posts reiterate, the devil is definitely at the source of witchcraft.""",
+    """The third part temporarily puts aside mediation analysis and shifts the discussion to moderation analysis. In Chapter 7, I show how a multiple regression model can be made more flexible by allowing one variable’s effect to depend linearly on another variable in the model. The resulting moderated multiple regression model allows an investigator to ascertain the extent to which X’s influence on outcome variable Y is contingent on or interacts with a moderator variable W.""",
+    """For instance, research has shown that people have a tendency to justify close others’ unethical actions to protect them (Gino and Galinsky 2012). Research has also shown that parents who feel close to their children often adopt strict curfew practices (Elder et al., 1995). (EC-33)""",
+    """Fitzpatrick and Pagani (2013) found that engagement skills in classroom behaviour at kindergarten were related with better math scores and academic success. (LC-0525-EN)""",
+    """The COAG Reform Council (2013) indicated that when compared to other students, Australian Year 4 students who attended one year of ECEC services or programs gained 11 points higher in reading (LC-0471-MA). Preliminary evidence suggests that teaching children from low-income families using humanoid robots increases motivation, sense of community, and self-expression... (EC-64). These findings suggest that visual perception takes up only a small fraction of fixation durations. Specifically, Verdelhan (2010) proposes a two-country, one-good model in which each country has an exogenously specified i.i.d. consumption growth process. Waters & Baur (2003) suggest that children or adolescents who are overweight or obese suffer from social and psychological issues. (LC-0460-EN)""",
+    """According to the Australian Bureau of Statistics (2008), the percentage of obese or overweight adults is a staggering 60%.
 According to George et al. (2011), in the UK immigration has improved the academic performance of the native children.
-According to UNICEF (2011) a child that is breastfed within the first hour of life is fourteen times less likely to die from diarrhoea or pneumonia.
+According to UNICEF (2011) a child that is breastfed within the first hour of life is fourteen times less likely to die from diarrhoea or pneumonia.""",
+    """As far as I am concerned, I do think globalization is good chance for China’s developing. From my point of view, I prefer to think that advantages of globalization outweighs disadvantages. """,
+    """As we know, China has made great progress for these years. I think it is the result of globalization. We all know China is a fast-developing country. We can seethe great progress that China has made. """,
+    """His idea was that an important ninth century bishop called John Anglicus may indeed have given birth to a child in full view of everyone on the streets of Rome, but that this bishop was not and never had been the pope. Of course, there is no evidence whatever for this, as Leibnitz himself well knew.""",
+    """On the whole, however, when evaluating meanings metaphorically, the Chinese EFL learners hedge and qualify their statements subjectively, tempering the certainty and authority of their assertions rather than using the resources of interpersonal metaphor to reinforce and substantiate their arguments. These tendencies reveal a key area for pedagogical intervention. Namely, instruction could focus on the value of construing metaphors objectively to obscure the author as the source of the evaluation. Similarly, raising students’ awareness of the space of negotiation and the value of offering assertions on a cline of certainty (e.g., IT IS EVIDENT) rather than through exclusive declarations of shared knowledge (e.g., AS WE ALL KNOW) is critical for academic writing refinement. Instructional interventions such as these are key areas for further investigation.""",
+    """Of the defendants involved in Utah Pie Company’s case only one seems to have emerged as exceptionally successful. However this success was not a factor of overwhelming market power, as can be seen by the dominant position of Mrs. Smith’s during this time, which had maintained a 39-45 percent market share over the corresponding period.""",
+    """Because of the evidence presented by Tremblay and Tremblay, it would appear that mergers in the brewing industry would have been procompetitive because of economies of scale. However, allowing a firm to acquire more than 20% of the market in Wisconsin would give it too much power to charge higher prices, even if the merger would help lower total average costs.""",
+    """Taken in whole, the economic evidence for grocery retailers in the decades after the Von’s decision suggests that increased concentration is pro-competitive and good for consumers, running contrary to the fears proposed by the Court.""",
+    """The remedies that Justice Lewis Powell prescribed did not gain the desired effect, and I feel that they were not very effective in promoting competition. (Elan, S86)""",
+    """There is the possibility for abuse if the producer sets different maximum prices for different retailers, allowing some to reap higher profits.""",
+    """Such a program, with appropriate limits, would provide a balanced structure that would ensure quality patient care.""",
+    """A recent survey of physician satisfaction by Harvard Medical School found that physician autonomy and the ability to provide high-quality care, not income, are most strongly associated with changes in job satisfaction . Thus, it seems reasonable to assume that health care providers would take advantage of the greater bargaining power to improve the quality of care. (Ken, S78-79)""",
+    """It appears, then, that maximum price fixing does the greatest harm when set below a competitive level [evidentialize]. In Case 4 it could potentially do harm to small retailers trying to enter the market [suggest], but does so for the benefit of consumers and the producer. Based purely on the models, it appears that, at the very least, maximum prices deserve a Rule of Reason approach to evaluate their cost and benefits.""",
+    """It could be seen that for this 68% of the respondents, Tampines was characteristically a location that provided for them all their basic needs. It can be seen from chart [11] that many people quoted accessibility and proximity to home, and even shopping as one of the ideal factors that drew them there. Accessibility is quite a key factor because it is evident that the regional centre was built on the basis of good infrastructure. In comparison, 32% of the respondents felt that the conventional downtown was still a major attraction, even though the regional centre had gained quite a vast amount of popularity and did to large extent have an air of modernity.""",
 ]
 
 
-@st.
+@st.cache_resource()
 def preprocess(text):
-    text = re.sub("\n\n",
-    text = re.sub(
-    text = re.sub(
-    text = re.sub(
+    text = re.sub("\n\n", " &&&&&&&&#&#&#&#&", text)
+    text = re.sub("\n", " ", text)
+    text = re.sub("\s+", " ", text)
+    text = re.sub("&&&&&&&&#&#&#&#&", "\n\n", text)
     return text
 
 
-@st.
+@st.cache_resource()
 def delete_span(span_sc: dict):
     id_del = []
     for n, spn in enumerate(span_sc, start=1):
@@ -171,7 +173,7 @@ with st.sidebar:
 | `Monogloss` | An utterance which does not employ any value of engagement. Such an utterance ignores the dialogic potential in an utterance. |
 
 """)
-    #For a more complete description of the category, visit [the annotation guideline](https://egumasa.github.io/engagement-annotation-project/3_Categories/)!!
+    # For a more complete description of the category, visit [the annotation guideline](https://egumasa.github.io/engagement-annotation-project/3_Categories/)!!
 
 st.sidebar.markdown("""
 Engagement Analyzer is developed by [Masaki Eguchi](https://masakieguchi.weebly.com).
@@ -211,45 +213,45 @@ with st.expander("See more explanation"):
 
 """)
 
-st.info(
+st.info("""Updated on Jan.11th, 2023\n
 The current version was trained on 2,519 sentences and tested on 443 sentences. It achieved the following benchmark:
 - Macro F1 = .75
 - Macro Precision = .78
 - Macro Recall = .74
 I expect that the model's performance improves as the annotated dataset gets larger.
-
+""")
 
 with st.form("my_form"):
-
     st.subheader("Option 1: selecting example text from list")
-    text = st.selectbox(
+    text = st.selectbox("", TEXT_LIST)
 
     st.subheader("Option 2: analyze your own text")
     input_text = st.text_area(
         label="",
-        value=
-
-
+        value="I would strongly encourage you to put your texts here to analyze it for stance-taking expressions.",
+        height=120,
+    )
     st.text(
-
+        "The text from the pull-down list and in the textbox cannot be analyzed at the same time. Please select the mode."
     )
 
     textmode = st.radio(
-        label=
-        options=[
-        index=1
+        label="Choose the mode.",
+        options=["Option 1: Pull-down choice", "Option 2: My own text"],
+        index=1,
+    )
 
     submitted = st.form_submit_button("Submit")
     if submitted:
-        if textmode ==
+        if textmode == "Option 2: My own text":
             text = input_text
-            with st.spinner(
+            with st.spinner("Analysis in progress..."):
                 doc = nlp(preprocess(text))
-            #st.markdown("> " + input_text)
+            # st.markdown("> " + input_text)
         else:
-            with st.spinner(
+            with st.spinner("Analysis in progress..."):
                 doc = nlp(preprocess(text))
-            #st.markdown("> " + text)
+            # st.markdown("> " + text)
 
 ## Dependency parsing
 
@@ -265,21 +267,21 @@ with st.form("my_form"):
 # st.write(text)
 # delete_span(doc.spans['sc'])
 
-cleanup_justify(doc, doc.spans[
-delete_overlapping_span(doc.spans[
+cleanup_justify(doc, doc.spans["sc"])
+delete_overlapping_span(doc.spans["sc"])
 
 visualize_spans(
     doc,
     spans_key="sc",
     displacy_options={
-
+        "template": {
             "span": TPL_SPAN,
-
-
+            "slice": TPL_SPAN_SLICE,
+            "start": TPL_SPAN_START,
         },
         "colors": {
             "ENTERTAIN": "#82b74b",
-            "DENY":
+            "DENY": "#c94c4c",
             "COUNTER": "#eea29a",
             "PRONOUNCE": "#92a8d1",
             "ENDORSE": "#034f84",
@@ -291,10 +293,12 @@ visualize_spans(
             "CITATION": "#F8C471",
             "SOURCES": "#F7DC6F",
             "JUSTIFYING": "#2ECC71",
-            "ENDOPHORIC": "#FAD7A0"
+            "ENDOPHORIC": "#FAD7A0",
         },
     },
-    simple
+    simple=False,
+    show_diversity=True,
+    show_confidence=False,
 )
 
 st.subheader("Bibliography")
@@ -304,4 +308,9 @@ st.markdown("""
 * Ryshina-Pankova, M. (2014). Exploring academic argumentation in course-related blogs through ENGAGEMENT. In G. Thompson & L. Alba-Juez (Eds.), _Pragmatics & Beyond New Series (Vol. 242, pp. 281–302)_. John Benjamins Publishing Company. https://doi.org/10.1075/pbns.242.14rys
 * Wu, S. M. (2007). The use of engagement resources in high- and low-rated undergraduate geography essays. _Journal of English for Academic Purposes, 6_ (3), 254–271. https://doi.org/10.1016/j.jeap.2007.09.006
 
-""")
+""")
+
+st.subheader("Please cite the following papers:")
+st.markdown("""* Eguchi, M., & Kyle, K. (2023). Span Identification of Epistemic Stance-Taking in Academic Written English. Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023), 429–442. https://aclanthology.org/2023.bea-1.35
+* Eguchi, M., & Kyle, K. (2024). Building custom NLP tools to annotate discourse-functional features for second language writing research: A tutorial. *Research Methods in Applied Linguistics, 3*(3), 100153. https://doi.org/10.1016/j.rmal.2024.100153
+""")
pipeline/__pycache__/post_processors.cpython-310.pyc
CHANGED
Binary files a/pipeline/__pycache__/post_processors.cpython-310.pyc and b/pipeline/__pycache__/post_processors.cpython-310.pyc differ
pipeline/post_processors.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
|
2 |
from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
|
3 |
import pandas as pd
|
4 |
import spacy
|
@@ -6,23 +5,38 @@ from spacy.language import Language
|
|
6 |
from skbio import diversity as dv
|
7 |
|
8 |
SPAN_ATTRS = ["text", "label_", "start", "end"]
|
9 |
-
CATEGORIES = [
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
columns = attrs + ["Conf. score"]
|
16 |
data = [
|
17 |
-
[str(getattr(span, attr))
|
18 |
-
|
19 |
-
|
|
|
20 |
]
|
21 |
return data, columns
|
22 |
|
23 |
|
24 |
# def span_info_aggregator()
|
25 |
|
|
|
26 |
def construction_classifier(doc, span):
|
27 |
category = None
|
28 |
spanroot = span.root
|
@@ -33,7 +47,6 @@ def construction_classifier(doc, span):
|
|
33 |
span_token = [t.norm_ for t in span]
|
34 |
span_tag = [t.tag_ for t in span]
|
35 |
|
36 |
-
|
37 |
c = [c for c in spanroot.children]
|
38 |
c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
|
39 |
|
@@ -44,30 +57,65 @@ def construction_classifier(doc, span):
|
|
44 |
|
45 |
right_dep = [c.dep_ for c in spanroot.rights]
|
46 |
|
47 |
-
#conditionals
|
48 |
-
subjless = all(
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
## nesting classifiers
|
53 |
if spanroot.dep_ == "conj":
|
54 |
-
while spanroot.dep_ ==
|
55 |
spanroot = spanroot.head
|
56 |
# if spanroot.dep_ == "poss":
|
57 |
# while spanroot.dep_ == 'poss':
|
58 |
# spanroot = spanroot.head
|
59 |
|
60 |
-
## Conjunctions
|
61 |
# Preconjunctions
|
62 |
-
if spanroot.dep_ in [
|
63 |
category = "Conjunction"
|
64 |
|
65 |
## NOUN PHRASES
|
66 |
# adverbial phrases
|
67 |
-
if spanroot.dep_ in [
|
68 |
category = "Adjectival modifier"
|
69 |
# adverbial phrases
|
70 |
-
if spanroot.dep_ in [
|
71 |
category = "Compound noun"
|
72 |
|
73 |
## Nominal category
|
@@ -85,21 +133,24 @@ def construction_classifier(doc, span):
|
|
85 |
|
86 |
## ADJUNCTS
|
87 |
# prep phrases
|
88 |
-
if spanroot.dep_ in [
|
89 |
-
category =
|
90 |
# adverbial phrases
|
91 |
-
if spanroot.dep_ in [
|
92 |
category = "Adverbial phrase"
|
93 |
|
94 |
## Predication patterns
|
95 |
-
if spanroot.dep_ in [
|
96 |
if "xcomp" in c_dep:
|
97 |
category = "Subject predicate to-cl"
|
98 |
else:
|
99 |
category = "Adjectival complement"
|
100 |
|
101 |
-
if spanroot.dep_ in [
|
102 |
-
subjless = all(
|
|
|
|
|
|
|
103 |
|
104 |
c_head = [c.dep_ for c in spanroot.head.children]
|
105 |
if "expl" in c_head and "no_det" in span_t_dep_:
|
@@ -108,86 +159,115 @@ def construction_classifier(doc, span):
|
|
108 |
category = "There is/are + Noun complement"
|
109 |
elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
|
110 |
category = "There is/are + Noun complement"
|
111 |
-
|
112 |
elif spanroot.pos_ in ["NOUN", "PRON"]:
|
113 |
if "acl" in c_dep:
|
114 |
category = "Noun + Complement (attr)"
|
115 |
else:
|
116 |
category = "Nominal complement"
|
117 |
|
118 |
-
elif not subjless and spanroot.pos_ in [
|
119 |
category = "Main verb 4"
|
120 |
|
121 |
-
elif spanroot.tag_ in [
|
122 |
category = "Nominal complement"
|
123 |
|
124 |
-
|
125 |
####################################
|
126 |
### clausal ####
|
127 |
####################################
|
128 |
-
if spanroot.dep_ in ["ROOT", "advcl", "ccomp",
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
## Start with broad category, which is then re-evaluated for specific constructions.
|
138 |
-
if spanroot.dep_ in [
|
139 |
## Adverbial clauses
|
140 |
### Finite-adverbial clauses
|
141 |
### Non-finite adverbial clauses
|
142 |
-
subjless = all(
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
146 |
category = "Finite adverbial clause"
|
147 |
-
elif "mark" in span_dep and "aux" in span_dep
|
148 |
category = "Finite adverbial clause"
|
149 |
|
150 |
-
elif
|
|
|
|
|
|
|
|
|
151 |
category = "Finite adverbial clause"
|
152 |
|
153 |
elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
|
154 |
-
if spanroot.pos_ in [
|
155 |
category = "Finite adverbial clause"
|
156 |
|
157 |
-
elif spanroot.pos_ not in [
|
158 |
category = "Non-finite adv clause 1"
|
159 |
|
160 |
elif entire_cl:
|
161 |
category = "Finite adverbial clause"
|
162 |
|
163 |
-
elif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
# he doing his job
|
165 |
if argmentless:
|
166 |
-
#e.g., frankly speaking, strictly speaking
|
167 |
category = "Adverbial Phrase"
|
168 |
else:
|
169 |
category = "Non-finite adv clause 2"
|
170 |
|
171 |
-
elif
|
172 |
-
|
|
|
173 |
category = "Non-finite adv clause 3"
|
174 |
-
|
175 |
elif "aux" in c_dep and "TO" in c_tag:
|
176 |
category = "Adverbial Phrase"
|
177 |
|
178 |
-
|
179 |
-
elif "mark" not in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
|
180 |
category = "Dependent Verb phrase"
|
181 |
-
|
182 |
-
elif not argmentless:
|
183 |
-
category = "Adverbial clause"
|
184 |
-
|
185 |
-
elif spanroot.dep_ == "advcl":
|
186 |
-
category = "Adverbial phrase"
|
187 |
|
|
|
|
|
188 |
|
189 |
-
|
|
|
190 |
|
|
|
191 |
head = spanroot.head
|
192 |
if ";" in [t.norm_ for t in head.children]:
|
193 |
category = "Main verb 3"
|
@@ -195,13 +275,20 @@ def construction_classifier(doc, span):
|
|
195 |
category = "Dependent verb 1"
|
196 |
elif "mark" in span_dep:
|
197 |
category = "Complement clause"
|
198 |
-
elif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
category = "Non-finite complement clause"
|
200 |
-
elif spanroot.dep_ in [
|
201 |
category = "Relative clause"
|
202 |
-
elif spanroot.dep_ in [
|
203 |
category = "Complement clause"
|
204 |
-
elif spanroot.dep_ in [
|
205 |
category = "Noun Complement clause"
|
206 |
else:
|
207 |
# print(_check_for_to)
|
@@ -209,55 +296,78 @@ def construction_classifier(doc, span):
|
|
209 |
|
210 |
## Specific constructions
|
211 |
# Extraposed that-clause or to-infinitives
|
212 |
-
if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
|
|
|
|
|
|
|
213 |
print(c_dep)
|
214 |
if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
|
215 |
-
#eg it seems odd (oprd) that X.
|
216 |
-
#eg it is certain (acomp) that X.
|
217 |
-
category =
|
|
|
|
|
218 |
|
219 |
elif "xcomp" in c_dep or ("advcl" in c_dep):
|
220 |
if "for_mark" in _check_for_to:
|
221 |
-
category =
|
|
|
|
|
222 |
elif _check_to:
|
223 |
-
category = "Extraposed to-cl 1"
|
224 |
elif _check_ing:
|
225 |
-
category = "Extraposed -ing 1"
|
226 |
-
elif (
|
227 |
-
|
|
|
|
|
|
|
|
|
228 |
|
229 |
elif "attr" in c_dep:
|
230 |
-
category = "Extraposed that-cl (copula)"
|
231 |
|
232 |
else:
|
233 |
-
category = "Extraposed that-cl (VERB)"
|
234 |
|
235 |
# if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
|
236 |
# category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
|
237 |
-
elif (
|
|
|
|
|
238 |
if "xcomp" in c_dep:
|
239 |
if _check_to:
|
240 |
-
category = "Extraposed to-cl 2"
|
241 |
elif _check_ing:
|
242 |
-
category = "Extraposed -ing 2"
|
243 |
-
|
244 |
else:
|
245 |
category = "Extraposed that-cl (adj-complement) 2"
|
246 |
|
247 |
elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
|
|
|
|
|
|
|
248 |
|
249 |
-
category = "Extraposed that-cl (adj-complement) 3" #eg it seems odd that X.
|
250 |
-
|
251 |
-
|
252 |
# something without dummy subject "it"
|
253 |
-
elif (
|
254 |
-
|
|
|
|
|
|
|
255 |
# store xcomp, if the head of the xcomp is acomp
|
256 |
-
_check_xcomp = [
|
257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
# _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
|
259 |
# _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
|
260 |
-
|
261 |
|
262 |
if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
|
263 |
if any(root_before_ccomp):
|
@@ -271,13 +381,13 @@ def construction_classifier(doc, span):
|
|
271 |
elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
|
272 |
category = "Post-predicate to-cl"
|
273 |
|
274 |
-
elif "xcomp" in c_dep and spanroot.lemma_ in [
|
275 |
category = "Subject predicate to-cl"
|
276 |
|
277 |
elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
|
278 |
category = "Subject predicate to-cl (passive)"
|
279 |
|
280 |
-
elif "xcomp" in c_dep and spanroot.lemma_ in [
|
281 |
category = "Subject predicate -ing"
|
282 |
elif "ccomp" in c_dep:
|
283 |
category = "Subject predicate that-cl"
|
@@ -290,9 +400,27 @@ def construction_classifier(doc, span):
|
|
290 |
category = "Main verb 1"
|
291 |
|
292 |
## without dummy subject it, and lexical verbs
|
293 |
-
elif (
|
294 |
-
|
295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
|
297 |
# _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
|
298 |
# _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
|
@@ -315,27 +443,34 @@ def construction_classifier(doc, span):
|
|
315 |
# Existential
|
316 |
elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
|
317 |
category = "There is/are NOUN"
|
318 |
-
|
319 |
-
elif "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]:
|
320 |
-
category = "Cleft construction"
|
321 |
|
|
|
|
|
|
|
|
|
322 |
|
323 |
-
if spanroot.dep_ in [
|
324 |
-
if "_".join(span_dep) in [
|
|
|
|
|
|
|
|
|
325 |
category = "Comment clause"
|
326 |
else:
|
327 |
category = "parataxis (for now)"
|
328 |
-
|
329 |
|
330 |
## External comp
|
331 |
-
if spanroot.dep_ in [
|
332 |
-
if spanroot.head.pos_ ==
|
333 |
category = "Adjective complement to-cl"
|
334 |
-
if spanroot.head.pos_ ==
|
335 |
category = "Verb complement to-cl"
|
336 |
-
|
337 |
-
if spanroot.dep_ in [
|
338 |
-
if
|
|
|
|
|
|
|
339 |
category = "Participle + that-cl"
|
340 |
elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
|
341 |
category = "Participle"
|
@@ -345,25 +480,28 @@ def construction_classifier(doc, span):
|
|
345 |
# if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
|
346 |
# category = "Gerund"
|
347 |
|
348 |
-
if spanroot.dep_ in [
|
349 |
category = "Negative particle"
|
350 |
-
if spanroot.dep_ in [
|
351 |
category = "Auxiliary"
|
352 |
|
353 |
# Modal verbs
|
354 |
if spanroot.tag_ == "MD":
|
355 |
category = "Modal auxiliary"
|
356 |
|
357 |
-
|
358 |
-
|
359 |
-
|
|
|
|
|
|
|
360 |
if spanroot.morph == spanroot.head.morph:
|
361 |
category = "Main verb 4"
|
362 |
else:
|
363 |
category = "Dependent verb 2"
|
364 |
elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
|
365 |
category = "Gerund"
|
366 |
-
elif spanroot.head.dep_ in [
|
367 |
if spanroot.morph == spanroot.head.morph:
|
368 |
category = "Main verb 4"
|
369 |
else:
|
@@ -372,7 +510,7 @@ def construction_classifier(doc, span):
|
|
372 |
category = "Dependent verb 2"
|
373 |
|
374 |
# Appositive phrases
|
375 |
-
if spanroot.dep_ in [
|
376 |
if "nummod" in c_dep:
|
377 |
category = "Apposition"
|
378 |
elif spanroot.pos_ in ["PROPN"]:
|
@@ -380,21 +518,23 @@ def construction_classifier(doc, span):
|
|
380 |
elif spanroot.pos_ in ["NOUN"]:
|
381 |
category = "Appositive Noun Phrase"
|
382 |
elif spanroot.pos_ in ["VERB", "AUX"]:
|
383 |
-
_check = any(
|
|
|
|
|
|
|
384 |
if _check:
|
385 |
category = "Appositive Finite-clause"
|
386 |
-
|
387 |
-
if spanroot.dep_ in [
|
388 |
-
if not subjless and spanroot.pos_ in [
|
389 |
category = "Main verb 5"
|
390 |
|
391 |
if spanroot.dep_ in ["dep", "mark"]:
|
392 |
if spanroot.tag_ in ["RB", "IN", "CC"]:
|
393 |
category = "Conjunction"
|
394 |
|
395 |
-
|
396 |
-
|
397 |
-
if spanroot.dep_ in ['aux', "auxpass", 'oprd', 'appos', "xcomp"]:
|
398 |
if spanroot.head.dep_ == "ROOT":
|
399 |
category = "Main verb"
|
400 |
else:
|
@@ -402,7 +542,7 @@ def construction_classifier(doc, span):
|
|
402 |
|
403 |
if span.label_ == "CITATION":
|
404 |
if "NNP" in span_tag or "NNPS" in span_tag:
|
405 |
-
if span_dep[0] ==
|
406 |
category = "Parenthetical Citation"
|
407 |
elif span_tag[0] in ["NNP", "NNPS"]:
|
408 |
category = "Narrative Citation"
|
@@ -425,7 +565,6 @@ def construction_classifier2(doc, span):
|
|
425 |
span_token = [t.norm_ for t in span]
|
426 |
span_tag = [t.tag_ for t in span]
|
427 |
|
428 |
-
|
429 |
c = [c for c in spanroot.children]
|
430 |
c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
|
431 |
|
@@ -436,43 +575,92 @@ def construction_classifier2(doc, span):
|
|
436 |
|
437 |
right_dep = [c.dep_ for c in spanroot.rights]
|
438 |
|
439 |
-
#conditionals
|
440 |
-
subjless = all(
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
445 |
|
446 |
## nesting classifiers
|
447 |
if spanroot.dep_ == "conj":
|
448 |
-
while spanroot.dep_ ==
|
449 |
spanroot = spanroot.head
|
450 |
|
451 |
if spanroot.dep_ == "poss":
|
452 |
head = spanroot.head
|
453 |
-
if head.dep_ in ["pobj", "dobj", "obj", "iobj"
|
454 |
category = "Posessive Noun (Object)"
|
455 |
elif head.dep_ in ["nsubj", "nsubjpass"]:
|
456 |
category = "Posessive Noun (Subject)"
|
457 |
else:
|
458 |
category = "Posessive Noun (Other)"
|
459 |
|
460 |
-
|
461 |
-
## Conjunctions
|
462 |
# Preconjunctions
|
463 |
-
if spanroot.dep_ in [
|
464 |
category = "Conjunction"
|
465 |
|
466 |
## NOUN PHRASES
|
467 |
# adverbial phrases
|
468 |
-
if spanroot.dep_ in [
|
469 |
category = "Adjectival modifier"
|
470 |
# adverbial phrases
|
471 |
-
if spanroot.dep_ in [
|
472 |
category = "Compound noun"
|
473 |
|
474 |
## Nominal category
|
475 |
-
if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj"
|
476 |
if "acl" in c_dep:
|
477 |
category = "Noun + Complement (Object)"
|
478 |
else:
|
@@ -486,22 +674,25 @@ def construction_classifier2(doc, span):
|
|
486 |
|
487 |
## ADJUNCTS
|
488 |
# prep phrases
|
489 |
-
if spanroot.dep_ in [
|
490 |
-
category =
|
491 |
|
492 |
# adverbial phrases
|
493 |
-
if spanroot.dep_ in [
|
494 |
category = "Adverbial phrase"
|
495 |
|
496 |
## Predication patterns
|
497 |
-
if spanroot.dep_ in [
|
498 |
if "xcomp" in c_dep:
|
499 |
category = "Subject predicate to-cl"
|
500 |
else:
|
501 |
category = "Adjectival complement"
|
502 |
|
503 |
-
if spanroot.dep_ in [
|
504 |
-
subjless = all(
|
|
|
|
|
|
|
505 |
|
506 |
c_head = [c.dep_ for c in spanroot.head.children]
|
507 |
if "expl" in c_head and "no_det" in span_t_dep_:
|
@@ -510,28 +701,31 @@ def construction_classifier2(doc, span):
|
|
510 |
category = "There is/are + Noun complement"
|
511 |
elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
|
512 |
category = "There is/are + Noun complement"
|
513 |
-
|
514 |
elif spanroot.pos_ in ["NOUN", "PRON"]:
|
515 |
if "acl" in c_dep:
|
516 |
category = "Noun + Complement (attr)"
|
517 |
else:
|
518 |
category = "Nominal complement"
|
519 |
|
520 |
-
elif not subjless and spanroot.pos_ in [
|
521 |
category = "Main verb 4"
|
522 |
|
523 |
-
elif spanroot.tag_ in [
|
524 |
category = "Nominal complement"
|
525 |
|
526 |
## External comp
|
527 |
-
if spanroot.dep_ in [
|
528 |
-
if spanroot.head.pos_ ==
|
529 |
category = "Adjective complement to-cl"
|
530 |
-
if spanroot.head.pos_ ==
|
531 |
category = "Verb complement to-cl"
|
532 |
-
|
533 |
-
if spanroot.dep_ in [
|
534 |
-
if
|
|
|
|
|
|
|
535 |
category = "Participle + that-cl"
|
536 |
elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
|
537 |
category = "Participle"
|
@@ -541,86 +735,117 @@ def construction_classifier2(doc, span):
|
|
541 |
# if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
|
542 |
# category = "Gerund"
|
543 |
|
544 |
-
if spanroot.dep_ in [
|
545 |
category = "Negative particle"
|
546 |
-
if spanroot.dep_ in [
|
547 |
category = "Auxiliary"
|
548 |
|
549 |
# Modal verbs
|
550 |
if spanroot.tag_ == "MD":
|
551 |
category = "Modal auxiliary"
|
552 |
|
553 |
-
|
554 |
####################################
|
555 |
### clausal ####
|
556 |
####################################
|
557 |
-
if spanroot.dep_ in ["ROOT", "advcl", "ccomp",
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
566 |
|
567 |
## Start with broad category, which is then re-evaluated for specific constructions.
|
568 |
-
if spanroot.dep_ in [
|
569 |
## Adverbial clauses
|
570 |
-
subjless = all(
|
571 |
-
|
|
|
|
|
|
|
|
|
|
|
572 |
|
573 |
### Finite-adverbial clauses
|
574 |
-
if "mark" in span_dep and (
|
|
|
|
|
575 |
category = "Finite adverbial clause"
|
576 |
|
577 |
-
elif "mark" in span_dep and "aux" in span_dep
|
578 |
category = "Finite adverbial clause"
|
579 |
|
580 |
-
elif
|
|
|
|
|
|
|
|
|
581 |
category = "Finite adverbial clause"
|
582 |
|
583 |
elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
|
584 |
-
if spanroot.pos_ in [
|
585 |
category = "Finite adverbial clause"
|
586 |
|
587 |
-
elif spanroot.pos_ not in [
|
588 |
category = "Non-finite adv clause 1"
|
589 |
|
590 |
elif not argmentless:
|
591 |
-
category =
|
592 |
|
593 |
## non-finite
|
594 |
-
elif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
595 |
# he doing his job
|
596 |
if argmentless:
|
597 |
-
#e.g., frankly speaking, strictly speaking
|
598 |
category = "Adverbial Phrase"
|
599 |
else:
|
600 |
category = "Non-finite adv clause 2"
|
601 |
|
602 |
-
elif
|
603 |
-
|
|
|
604 |
category = "Non-finite adv clause 3"
|
605 |
-
|
606 |
elif "aux" in c_dep and "TO" in c_tag:
|
607 |
category = "Adverbial Phrase"
|
608 |
|
609 |
-
|
610 |
-
elif "mark" not in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
|
611 |
category = "Dependent Verb phrase"
|
612 |
-
|
613 |
elif not argmentless:
|
614 |
-
category =
|
615 |
-
|
616 |
elif spanroot.dep_ == "advcl":
|
617 |
-
category =
|
618 |
-
|
619 |
else:
|
620 |
category = "Finite adverbial clause "
|
621 |
|
622 |
-
if spanroot.dep_ in [
|
623 |
-
|
624 |
head = spanroot.head
|
625 |
if ";" in [t.norm_ for t in head.children]:
|
626 |
category = "Main verb 3"
|
@@ -630,66 +855,96 @@ def construction_classifier2(doc, span):
|
|
630 |
|
631 |
elif "mark" in span_dep:
|
632 |
category = "Complement clause"
|
633 |
-
elif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
634 |
category = "Non-finite complement clause"
|
635 |
-
elif spanroot.dep_ in [
|
636 |
category = "Relative clause"
|
637 |
-
elif spanroot.dep_ in [
|
638 |
category = "Complement clause"
|
639 |
-
elif spanroot.dep_ in [
|
640 |
category = "Noun Complement clause"
|
641 |
|
642 |
## Specific constructions
|
643 |
# Extraposed that-clause or to-infinitives
|
644 |
-
if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
|
|
|
|
|
|
|
645 |
# print(c_dep)
|
646 |
if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
|
647 |
-
#eg it seems odd (oprd) that X.
|
648 |
-
#eg it is certain (acomp) that X.
|
649 |
-
category =
|
|
|
|
|
650 |
|
651 |
elif "xcomp" in c_dep or ("advcl" in c_dep):
|
652 |
if "for_mark" in _check_for_to:
|
653 |
-
category =
|
|
|
|
|
654 |
elif _check_to:
|
655 |
-
category = "Extraposed to-cl 1"
|
656 |
elif _check_ing:
|
657 |
-
category = "Extraposed -ing 1"
|
658 |
-
elif (
|
659 |
-
|
|
|
|
|
|
|
|
|
660 |
|
661 |
elif "attr" in c_dep:
|
662 |
-
category = "Extraposed that-cl (copula)"
|
663 |
|
664 |
else:
|
665 |
-
category = "Extraposed that-cl (VERB)"
|
666 |
|
667 |
# if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
|
668 |
# category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
|
669 |
-
elif (
|
|
|
|
|
670 |
if "xcomp" in c_dep:
|
671 |
if _check_to:
|
672 |
-
category = "Extraposed to-cl 2"
|
673 |
elif _check_ing:
|
674 |
-
category = "Extraposed -ing 2"
|
675 |
-
|
676 |
else:
|
677 |
category = "Extraposed that-cl (adj-complement) 2"
|
678 |
|
679 |
elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
|
|
|
|
|
|
|
680 |
|
681 |
-
category = "Extraposed that-cl (adj-complement) 3" #eg it seems odd that X.
|
682 |
-
|
683 |
-
|
684 |
# something without dummy subject "it"
|
685 |
-
elif (
|
686 |
-
|
|
|
|
|
|
|
687 |
# store xcomp, if the head of the xcomp is acomp
|
688 |
-
_check_xcomp = [
|
689 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
690 |
# _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
|
691 |
# _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
|
692 |
-
|
693 |
|
694 |
if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
|
695 |
if any(root_before_ccomp):
|
@@ -703,13 +958,13 @@ def construction_classifier2(doc, span):
|
|
703 |
elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
|
704 |
category = "Post-predicate to-cl"
|
705 |
|
706 |
-
elif "xcomp" in c_dep and spanroot.lemma_ in [
|
707 |
category = "Subject predicate to-cl"
|
708 |
|
709 |
elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
|
710 |
category = "Subject predicate to-cl (passive)"
|
711 |
|
712 |
-
elif "xcomp" in c_dep and spanroot.lemma_ in [
|
713 |
category = "Subject predicate -ing"
|
714 |
elif "ccomp" in c_dep:
|
715 |
category = "Subject predicate that-cl"
|
@@ -724,9 +979,27 @@ def construction_classifier2(doc, span):
|
|
724 |
category = "Main verb 1"
|
725 |
|
726 |
## without dummy subject it, and lexical verbs
|
727 |
-
elif (
|
728 |
-
|
729 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
730 |
|
731 |
# _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
|
732 |
# _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
|
@@ -746,40 +1019,48 @@ def construction_classifier2(doc, span):
|
|
746 |
elif _check_ing:
|
747 |
category = "Post-predicate -ing"
|
748 |
|
749 |
-
|
750 |
-
|
751 |
# Existential
|
752 |
elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
|
753 |
category = "There is/are NOUN"
|
754 |
-
|
755 |
-
elif
|
|
|
|
|
756 |
category = "Cleft construction"
|
757 |
|
758 |
### The end of clausal analysis
|
759 |
-
|
760 |
-
if spanroot.dep_ in [
|
761 |
-
if "_".join(span_dep) in [
|
|
|
|
|
|
|
|
|
762 |
category = "Comment clause"
|
763 |
else:
|
764 |
category = "Parataxis"
|
765 |
-
|
766 |
|
767 |
-
if spanroot.dep_ in [
|
768 |
-
if
|
|
|
|
|
|
|
|
|
769 |
if spanroot.morph == spanroot.head.morph:
|
770 |
category = "Main verb 4"
|
771 |
else:
|
772 |
category = "Dependent verb 2"
|
773 |
elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
|
774 |
category = "Gerund"
|
775 |
-
elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(
|
|
|
|
|
776 |
category = "Dependent verb 2"
|
777 |
-
elif spanroot.dep_ in ["csubj",
|
778 |
category = "Dependent verb (csubj)"
|
779 |
|
780 |
-
|
781 |
# Appositive phrases
|
782 |
-
if spanroot.dep_ in [
|
783 |
if "nummod" in c_dep:
|
784 |
category = "Apposition"
|
785 |
if spanroot.pos_ in ["PROPN"]:
|
@@ -787,16 +1068,18 @@ def construction_classifier2(doc, span):
|
|
787 |
elif spanroot.pos_ in ["NOUN"]:
|
788 |
category = "Appositive Noun Phrase"
|
789 |
elif spanroot.pos_ in ["VERB", "AUX"]:
|
790 |
-
_check = any(
|
|
|
|
|
|
|
791 |
if _check:
|
792 |
category = "Appositive Finite-clause"
|
793 |
-
|
794 |
|
795 |
-
if spanroot.dep_ in [
|
796 |
-
if not subjless and spanroot.pos_ in [
|
797 |
category = "Main verb (likely parsing error)"
|
798 |
|
799 |
-
#sometimes the dep are on the conjunctions
|
800 |
if spanroot.dep_ in ["dep", "mark"]:
|
801 |
if spanroot.tag_ in ["RB", "IN", "CC"]:
|
802 |
category = "Conjunction"
|
@@ -804,9 +1087,12 @@ def construction_classifier2(doc, span):
|
|
804 |
if spanroot.dep_ in ["intj"]:
|
805 |
category = "Introjection"
|
806 |
|
807 |
-
|
808 |
-
|
809 |
-
|
|
|
|
|
|
|
810 |
if spanroot.head.dep_ == "ROOT":
|
811 |
category = "Main verb"
|
812 |
else:
|
@@ -814,7 +1100,7 @@ def construction_classifier2(doc, span):
|
|
814 |
|
815 |
if span.label_ == "CITATION":
|
816 |
if "NNP" in span_tag or "NNPS" in span_tag:
|
817 |
-
if span_dep[0] ==
|
818 |
category = "Parenthetical Citation"
|
819 |
elif span_tag[0] in ["NNP", "NNPS"]:
|
820 |
category = "Narrative Citation"
|
@@ -827,18 +1113,32 @@ def construction_classifier2(doc, span):
|
|
827 |
return category
|
828 |
|
829 |
|
830 |
-
|
831 |
-
|
832 |
-
|
833 |
-
|
834 |
-
|
835 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
836 |
data = []
|
837 |
# data = span_info_aggregator(doc, columns)
|
838 |
sentences = {s: i for i, s in enumerate(doc.sents)}
|
839 |
|
840 |
-
for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs[
|
841 |
-
|
842 |
span_info = []
|
843 |
span_info.extend([str(getattr(span, attr)) for attr in attrs])
|
844 |
|
@@ -854,7 +1154,7 @@ def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
|
|
854 |
span_info.append(span.root.head.norm_)
|
855 |
span_info.append(span.root.head.dep_)
|
856 |
span_info.append("_".join([c.dep_ for c in span.root.children]))
|
857 |
-
span_info.append(span.root.morph)
|
858 |
span_info.append(span.sent.text.strip())
|
859 |
|
860 |
data.append(span_info)
|
@@ -862,27 +1162,27 @@ def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
|
|
862 |
return data, columns
|
863 |
|
864 |
|
865 |
-
def ngrammar(seq: list, n=2, concat
|
866 |
result = []
|
867 |
n_item = len(seq)
|
868 |
for idx, item in enumerate(seq):
|
869 |
if idx + n <= n_item:
|
870 |
if concat:
|
871 |
-
result.append(sep.join(seq[idx: idx + n]))
|
872 |
else:
|
873 |
-
result.append(seq[idx: idx + n])
|
874 |
return result
|
875 |
|
876 |
|
877 |
def diversity_values(count_vec: list):
|
878 |
result = {}
|
879 |
if len(count_vec) == 0:
|
880 |
-
count_vec = [0,0,0,0,0,0,0,0,0,0]
|
881 |
|
882 |
-
result[
|
883 |
-
result[
|
884 |
-
result["simpson_d"] = 1- dv.alpha.simpson(list(count_vec))
|
885 |
-
result[
|
886 |
# result['gini_index'] = dv.alpha.gini_index(list(count_vec))
|
887 |
# result['faith_pd'] = dv.alpha.faith_pd(list(count_vec))
|
888 |
|
|
|
|
|
1 |
from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
|
2 |
import pandas as pd
|
3 |
import spacy
|
|
|
5 |
from skbio import diversity as dv
|
6 |
|
7 |
SPAN_ATTRS = ["text", "label_", "start", "end"]
|
8 |
+
CATEGORIES = [
|
9 |
+
"ATTRIBUTION",
|
10 |
+
"CITATION",
|
11 |
+
"COUNTER",
|
12 |
+
"DENY",
|
13 |
+
"ENDOPHORIC",
|
14 |
+
"ENTERTAIN",
|
15 |
+
"JUSTIFYING",
|
16 |
+
"MONOGLOSS",
|
17 |
+
"PROCLAIM",
|
18 |
+
"SOURCES",
|
19 |
+
]
|
20 |
+
|
21 |
+
|
22 |
+
def simple_table(
|
23 |
+
doc: Union[spacy.tokens.Doc, Dict[str, str]],
|
24 |
+
spans_key: str = "sc",
|
25 |
+
attrs: List[str] = SPAN_ATTRS,
|
26 |
+
):
|
27 |
columns = attrs + ["Conf. score"]
|
28 |
data = [
|
29 |
+
[str(getattr(span, attr)) for attr in attrs] + [score] # [f'{score:.5f}']
|
30 |
+
for span, score in zip(
|
31 |
+
doc.spans[spans_key], doc.spans[spans_key].attrs["scores"]
|
32 |
+
)
|
33 |
]
|
34 |
return data, columns
|
35 |
|
36 |
|
37 |
# def span_info_aggregator()
|
38 |
|
39 |
+
|
40 |
def construction_classifier(doc, span):
|
41 |
category = None
|
42 |
spanroot = span.root
|
|
|
47 |
span_token = [t.norm_ for t in span]
|
48 |
span_tag = [t.tag_ for t in span]
|
49 |
|
|
|
50 |
c = [c for c in spanroot.children]
|
51 |
c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
|
52 |
|
|
|
57 |
|
58 |
right_dep = [c.dep_ for c in spanroot.rights]
|
59 |
|
60 |
+
# conditionals
|
61 |
+
subjless = all(
|
62 |
+
c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
|
63 |
+
for c in spanroot.children
|
64 |
+
)
|
65 |
+
argmentless = all(
|
66 |
+
c.dep_
|
67 |
+
not in [
|
68 |
+
"nsubj",
|
69 |
+
"nsubjpass",
|
70 |
+
"csubj",
|
71 |
+
"csubjpass",
|
72 |
+
"dobj",
|
73 |
+
"ccomp",
|
74 |
+
"xcomp",
|
75 |
+
"dative",
|
76 |
+
"attr",
|
77 |
+
"oprd",
|
78 |
+
"acomp",
|
79 |
+
]
|
80 |
+
for c in spanroot.children
|
81 |
+
)
|
82 |
+
argless_span = all(
|
83 |
+
c.dep_
|
84 |
+
not in [
|
85 |
+
"nsubj",
|
86 |
+
"nsubjpass",
|
87 |
+
"csubj",
|
88 |
+
"csubjpass",
|
89 |
+
"dobj",
|
90 |
+
"ccomp",
|
91 |
+
"xcomp",
|
92 |
+
"dative",
|
93 |
+
"attr",
|
94 |
+
"oprd",
|
95 |
+
"acomp",
|
96 |
+
]
|
97 |
+
for c in span
|
98 |
+
)
|
99 |
|
100 |
## nesting classifiers
|
101 |
if spanroot.dep_ == "conj":
|
102 |
+
while spanroot.dep_ == "conj":
|
103 |
spanroot = spanroot.head
|
104 |
# if spanroot.dep_ == "poss":
|
105 |
# while spanroot.dep_ == 'poss':
|
106 |
# spanroot = spanroot.head
|
107 |
|
108 |
+
## Conjunctions
|
109 |
# Preconjunctions
|
110 |
+
if spanroot.dep_ in ["preconj", "cc"]:
|
111 |
category = "Conjunction"
|
112 |
|
113 |
## NOUN PHRASES
|
114 |
# adverbial phrases
|
115 |
+
if spanroot.dep_ in ["amod"]:
|
116 |
category = "Adjectival modifier"
|
117 |
# adverbial phrases
|
118 |
+
if spanroot.dep_ in ["compound"]:
|
119 |
category = "Compound noun"
|
120 |
|
121 |
## Nominal category
|
|
|
133 |
|
134 |
## ADJUNCTS
|
135 |
# prep phrases
|
136 |
+
if spanroot.dep_ in ["prep", "agent"]:
|
137 |
+
category = "Prepositional phrase"
|
138 |
# adverbial phrases
|
139 |
+
if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod"]:
|
140 |
category = "Adverbial phrase"
|
141 |
|
142 |
## Predication patterns
|
143 |
+
if spanroot.dep_ in ["acomp", "oprd"]:
|
144 |
if "xcomp" in c_dep:
|
145 |
category = "Subject predicate to-cl"
|
146 |
else:
|
147 |
category = "Adjectival complement"
|
148 |
|
149 |
+
if spanroot.dep_ in ["attr"]:
|
150 |
+
subjless = all(
|
151 |
+
c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
|
152 |
+
for c in spanroot.children
|
153 |
+
)
|
154 |
|
155 |
c_head = [c.dep_ for c in spanroot.head.children]
|
156 |
if "expl" in c_head and "no_det" in span_t_dep_:
|
|
|
159 |
category = "There is/are + Noun complement"
|
160 |
elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
|
161 |
category = "There is/are + Noun complement"
|
162 |
+
|
163 |
elif spanroot.pos_ in ["NOUN", "PRON"]:
|
164 |
if "acl" in c_dep:
|
165 |
category = "Noun + Complement (attr)"
|
166 |
else:
|
167 |
category = "Nominal complement"
|
168 |
|
169 |
+
elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
|
170 |
category = "Main verb 4"
|
171 |
|
172 |
+
elif spanroot.tag_ in ["NNP"]:
|
173 |
category = "Nominal complement"
|
174 |
|
|
|
175 |
####################################
|
176 |
### clausal ####
|
177 |
####################################
|
178 |
+
if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"]:
|
179 |
+
_check_to = [
|
180 |
+
c.dep_
|
181 |
+
for c in spanroot.subtree
|
182 |
+
if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
|
183 |
+
 183 | +         and c.head.dep_ == "xcomp"
 184 | +     ]
 185 | +     _check_ing = [
 186 | +         c.dep_
 187 | +         for c in spanroot.subtree
 188 | +         if "Prog" in str(c.morph) and c.dep_ == "xcomp"
 189 | +     ]
 190 | +     root_before_ccomp = [
 191 | +         c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
 192 | +     ]
 193 | +
 194 | +     _check_for_to = [
 195 | +         "_".join([c.norm_, c.dep_])
 196 | +         for c in spanroot.subtree
 197 | +         if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
 198 | +     ]
 199 | +     entire_cl = (
 200 | +         spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
 201 | +     )
 202 |
 203 |       ## Start with broad category, which is then re-evaluated for specific constructions.
 204 | +     if spanroot.dep_ in ["advcl", "mark", "acl", "pcomp"]:
 205 |           ## Adverbial clauses
 206 |           ### Finite-adverbial clauses
 207 |           ### Non-finite adverbial clauses
 208 | +         subjless = all(
 209 | +             c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
 210 | +             for c in spanroot.children
 211 | +         )
 212 | +         entire_cl = (
 213 | +             spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
 214 | +         )
 215 | +
 216 | +         if "mark" in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
 217 |               category = "Finite adverbial clause"
 218 | +         elif "mark" in span_dep and "aux" in span_dep:
 219 |               category = "Finite adverbial clause"
 220 |
 221 | +         elif (
 222 | +             "mark" in span_dep
 223 | +             and spanroot.pos_ in ["VERB", "AUX"]
 224 | +             and "expl" in c_dep
 225 | +         ):
 226 |               category = "Finite adverbial clause"
 227 |
 228 |           elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
 229 | +             if spanroot.pos_ in ["VERB", "AUX"]:
 230 |                   category = "Finite adverbial clause"
 231 |
 232 | +             elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
 233 |                   category = "Non-finite adv clause 1"
 234 |
 235 |           elif entire_cl:
 236 |               category = "Finite adverbial clause"
 237 |
 238 | +         elif (
 239 | +             str(spanroot.morph)
 240 | +             in [
 241 | +                 "Aspect=Prog|Tense=Pres|VerbForm=Part",
 242 | +                 "Aspect=Perf|Tense=Past|VerbForm=Part",
 243 | +             ]
 244 | +             and "aux" not in c_dep
 245 | +         ):
 246 |               # he doing his job
 247 |               if argmentless:
 248 | +                 # e.g., frankly speaking, strictly speaking
 249 |                   category = "Adverbial Phrase"
 250 |               else:
 251 |                   category = "Non-finite adv clause 2"
 252 |
 253 | +         elif (
 254 | +             spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
 255 | +         ):
 256 |               category = "Non-finite adv clause 3"
 257 | +
 258 |           elif "aux" in c_dep and "TO" in c_tag:
 259 |               category = "Adverbial Phrase"
 260 |
 261 | +         elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
 262 |               category = "Dependent Verb phrase"
 263 |
 264 | +         elif not argmentless:
 265 | +             category = "Adverbial clause"
 266 |
 267 | +         elif spanroot.dep_ == "advcl":
 268 | +             category = "Adverbial phrase"
 269 |
 270 | +     if spanroot.dep_ in ["relcl", "ccomp", "acl"]:
 271 |           head = spanroot.head
 272 |           if ";" in [t.norm_ for t in head.children]:
 273 |               category = "Main verb 3"
 275 |               category = "Dependent verb 1"
 276 |           elif "mark" in span_dep:
 277 |               category = "Complement clause"
 278 | +         elif (
 279 | +             str(spanroot.morph)
 280 | +             in [
 281 | +                 "Aspect=Prog|Tense=Pres|VerbForm=Part",
 282 | +                 "Aspect=Perf|Tense=Past|VerbForm=Part",
 283 | +             ]
 284 | +             and "aux" not in c_dep
 285 | +         ):
 286 |               category = "Non-finite complement clause"
 287 | +         elif spanroot.dep_ in ["relcl"]:
 288 |               category = "Relative clause"
 289 | +         elif spanroot.dep_ in ["ccomp"]:
 290 |               category = "Complement clause"
 291 | +         elif spanroot.dep_ in ["acl"]:
 292 |               category = "Noun Complement clause"
 293 |           else:
 294 |               # print(_check_for_to)
 296 |
 297 |       ## Specific constructions
 298 |       # Extraposed that-clause or to-infinitives
 299 | +     if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
 300 | +         "VERB",
 301 | +         "AUX",
 302 | +     ]:
 303 |           print(c_dep)
 304 |           if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
 305 | +             # e.g., it seems odd (oprd) that X.
 306 | +             # e.g., it is certain (acomp) that X.
 307 | +             category = (
 308 | +                 "Extraposed that-cl (adj-complement)"  # e.g., it is certain that X.
 309 | +             )
 310 |
 311 |           elif "xcomp" in c_dep or ("advcl" in c_dep):
 312 |               if "for_mark" in _check_for_to:
 313 | +                 category = (
 314 | +                     "Extraposed to-cl (explicit subj)"  # e.g., It is possible to .
 315 | +                 )
 316 |               elif _check_to:
 317 | +                 category = "Extraposed to-cl 1"  # e.g., It is possible to .
 318 |               elif _check_ing:
 319 | +                 category = "Extraposed -ing 1"  # e.g., It is possible to .
 320 | +             elif (
 321 | +                 ("prep" in right_dep or "npadvmod" in right_dep)
 322 | +                 and "ccomp" in right_dep
 323 | +                 and spanroot.lemma_ == "be"
 324 | +             ):
 325 | +                 category = "Cleft construction"
 326 |
 327 |           elif "attr" in c_dep:
 328 | +             category = "Extraposed that-cl (copula)"  # e.g., It is a wonder that X.
 329 |
 330 |           else:
 331 | +             category = "Extraposed that-cl (VERB)"
 332 |
 333 |       # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
 334 |       #     category = "Extraposed that-cl (VERB)1"  # e.g., it has been shown that X.
 335 | +     elif (
 336 | +         "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
 337 | +     ) and "acomp" in c_dep:
 338 |           if "xcomp" in c_dep:
 339 |               if _check_to:
 340 | +                 category = "Extraposed to-cl 2"  # e.g., it is difficult to decide.
 341 |               elif _check_ing:
 342 | +                 category = "Extraposed -ing 2"  # e.g., it is difficult to decide.
 343 | +
 344 |           else:
 345 |               category = "Extraposed that-cl (adj-complement) 2"
 346 |
 347 |       elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
 348 | +         category = (
 349 | +             "Extraposed that-cl (adj-complement) 3"  # e.g., it seems odd that X.
 350 | +         )
 351 |
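For orientation: the branches above key on flat projections of the spaCy parse. A minimal sketch of how those variables are derived — the sentence, the span boundaries, and the en_core_web_sm model are illustrative assumptions, not part of this commit, and parse details vary by model version:

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("It is certain that the plan will work.")
    span = doc[0:8]           # hypothetical engagement span
    spanroot = span.root      # typically "is"

    span_dep = [t.dep_ for t in span]                  # dep label of every span token
    c_dep = [c.dep_ for c in spanroot.children]        # dep labels of the root's children
    c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
    # "it_nsubj" in c_t_dep_ together with an acomp/ccomp child is the cue
    # the extraposition branches above test for.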
 352 |       # something without dummy subject "it"
 353 | +     elif (
 354 | +         (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
 355 | +         and spanroot.pos_ in ["AUX", "VERB"]
 356 | +         and "it" not in c_norm
 357 | +     ):
 358 |           # store xcomp, if the head of the xcomp is acomp
 359 | +         _check_xcomp = [
 360 | +             c.dep_
 361 | +             for c in spanroot.subtree
 362 | +             if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
 363 | +         ]
 364 | +         _check_ccomp = [
 365 | +             c.dep_
 366 | +             for c in spanroot.subtree
 367 | +             if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
 368 | +         ]
 369 |           # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
 370 |           # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 371 |
 372 |           if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
 373 |               if any(root_before_ccomp):
 381 |           elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
 382 |               category = "Post-predicate to-cl"
 383 |
 384 | +         elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
 385 |               category = "Subject predicate to-cl"
 386 |
 387 |           elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
 388 |               category = "Subject predicate to-cl (passive)"
 389 |
 390 | +         elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
 391 |               category = "Subject predicate -ing"
 392 |           elif "ccomp" in c_dep:
 393 |               category = "Subject predicate that-cl"
 400 |               category = "Main verb 1"
 401 |
 402 |       ## without dummy subject it, and lexical verbs
 403 | +     elif (
 404 | +         ("nsubj" in c_dep or "nsubjpass" in c_dep)
 405 | +         and spanroot.pos_ in ["AUX", "VERB"]
 406 | +         and "it" not in c_norm
 407 | +         and spanroot.lemma_ not in ["be"]
 408 | +     ):
 409 | +         _check_wh = [
 410 | +             c.dep_
 411 | +             for c in spanroot.subtree
 412 | +             if (
 413 | +                 c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
 414 | +                 and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
 415 | +             )
 416 | +             and c.head.dep_ == "ccomp"
 417 | +         ]
 418 | +         _check_if = [
 419 | +             c.dep_
 420 | +             for c in spanroot.subtree
 421 | +             if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
 422 | +             and c.head.dep_ == "ccomp"
 423 | +         ]
 424 |
 425 |           # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
 426 |           # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 443 |           # Existential
 444 |           elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
 445 |               category = "There is/are NOUN"
 446 |
 447 | +     elif (
 448 | +         "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
 449 | +     ):
 450 | +         category = "Cleft construction"
 451 |
 452 | +     if spanroot.dep_ in ["parataxis"]:
 453 | +         if "_".join(span_dep) in [
 454 | +             "nsubj_parataxis",
 455 | +             "aux_parataxis",
 456 | +             "nsubj_aux_parataxis",
 457 | +         ]:
 458 |               category = "Comment clause"
 459 |           else:
 460 |               category = "parataxis (for now)"
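The parataxis branch right above targets comment clauses ("..., I think, ..."), where the reporting clause attaches to its host clause as parataxis. A quick way to see the pattern — hypothetical sentence, en_core_web_sm assumed, and the attachment can vary by model:

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The plan, I think, will work.")
    comment = doc[3:5]                    # "I think"
    print([t.dep_ for t in comment])      # typically ['nsubj', 'parataxis']
    # "_".join(...) -> "nsubj_parataxis", which the branch above maps to "Comment clause".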
 461 |
 462 |       ## External comp
 463 | +     if spanroot.dep_ in ["xcomp"]:
 464 | +         if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
 465 |               category = "Adjective complement to-cl"
 466 | +         if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
 467 |               category = "Verb complement to-cl"
 468 | +
 469 | +     if spanroot.dep_ in ["pcomp"]:
 470 | +         if (
 471 | +             str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
 472 | +             and "ccomp" in c_dep
 473 | +         ):
 474 |               category = "Participle + that-cl"
 475 |           elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
 476 |               category = "Participle"
 480 |       # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
 481 |       #     category = "Gerund"
 482 |
 483 | +     if spanroot.dep_ in ["neg"]:
 484 |           category = "Negative particle"
 485 | +     if spanroot.dep_ in ["aux", "auxpass"]:
 486 |           category = "Auxiliary"
 487 |
 488 |       # Modal verbs
 489 |       if spanroot.tag_ == "MD":
 490 |           category = "Modal auxiliary"
 491 |
 492 | +     if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
 493 | +         if (
 494 | +             spanroot.head.dep_ in ["ROOT", "ccomp"]
 495 | +             and spanroot.head.pos_ in ["AUX", "VERB"]
 496 | +             and spanroot.pos_ in ["AUX", "VERB"]
 497 | +         ):
 498 |               if spanroot.morph == spanroot.head.morph:
 499 |                   category = "Main verb 4"
 500 |               else:
 501 |                   category = "Dependent verb 2"
 502 |           elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
 503 |               category = "Gerund"
 504 | +         elif spanroot.head.dep_ in ["conj", "acl", "relcl"]:
 505 |               if spanroot.morph == spanroot.head.morph:
 506 |                   category = "Main verb 4"
 507 |               else:
 510 |                   category = "Dependent verb 2"
 511 |
 512 |       # Appositive phrases
 513 | +     if spanroot.dep_ in ["appos"]:
 514 |           if "nummod" in c_dep:
 515 |               category = "Apposition"
 516 |           elif spanroot.pos_ in ["PROPN"]:
 518 |           elif spanroot.pos_ in ["NOUN"]:
 519 |               category = "Appositive Noun Phrase"
 520 |           elif spanroot.pos_ in ["VERB", "AUX"]:
 521 | +             _check = any(
 522 | +                 c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
 523 | +                 for c in spanroot.children
 524 | +             )
 525 |               if _check:
 526 |                   category = "Appositive Finite-clause"
 527 | +
 528 | +     if spanroot.dep_ in ["appos", "dep", "attr"]:
 529 | +         if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
 530 |               category = "Main verb 5"
 531 |
 532 |       if spanroot.dep_ in ["dep", "mark"]:
 533 |           if spanroot.tag_ in ["RB", "IN", "CC"]:
 534 |               category = "Conjunction"
 535 |
 536 | +     # sometimes the extra-clausal links are not accurate
 537 | +     if spanroot.dep_ in ["aux", "auxpass", "oprd", "appos", "xcomp"]:
 538 |           if spanroot.head.dep_ == "ROOT":
 539 |               category = "Main verb"
 540 |           else:
 542 |
 543 |       if span.label_ == "CITATION":
 544 |           if "NNP" in span_tag or "NNPS" in span_tag:
 545 | +             if span_dep[0] == "punct" and span_dep[-1] == "punct":
 546 |                   category = "Parenthetical Citation"
 547 |               elif span_tag[0] in ["NNP", "NNPS"]:
 548 |                   category = "Narrative Citation"
 565 |       span_token = [t.norm_ for t in span]
 566 |       span_tag = [t.tag_ for t in span]
 567 |
 568 |       c = [c for c in spanroot.children]
 569 |       c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
 570 |
 575 |
 576 |       right_dep = [c.dep_ for c in spanroot.rights]
 577 |
 578 | +     # conditionals
 579 | +     subjless = all(
 580 | +         c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
 581 | +         for c in spanroot.children
 582 | +     )
 583 | +     argmentless = all(
 584 | +         c.dep_
 585 | +         not in [
 586 | +             "nsubj",
 587 | +             "nsubjpass",
 588 | +             "csubj",
 589 | +             "csubjpass",
 590 | +             "dobj",
 591 | +             "ccomp",
 592 | +             "xcomp",
 593 | +             "dative",
 594 | +             "attr",
 595 | +             "oprd",
 596 | +             "acomp",
 597 | +         ]
 598 | +         for c in spanroot.children
 599 | +     )
 600 | +     argless_span = all(
 601 | +         c.dep_
 602 | +         not in [
 603 | +             "nsubj",
 604 | +             "nsubjpass",
 605 | +             "csubj",
 606 | +             "csubjpass",
 607 | +             "dobj",
 608 | +             "ccomp",
 609 | +             "xcomp",
 610 | +             "dative",
 611 | +             "attr",
 612 | +             "oprd",
 613 | +             "acomp",
 614 | +         ]
 615 | +         for c in span
 616 | +     )
 617 | +     argless_span = all(
 618 | +         c.dep_
 619 | +         not in [
 620 | +             "nsubj",
 621 | +             "nsubjpass",
 622 | +             "csubj",
 623 | +             "csubjpass",
 624 | +             "dobj",
 625 | +             "ccomp",
 626 | +             "xcomp",
 627 | +             "dative",
 628 | +             "attr",
 629 | +             "oprd",
 630 | +             "acomp",
 631 | +         ]
 632 | +         for c in span
 633 | +     )
 634 |
 635 |       ## nesting classifiers
 636 |       if spanroot.dep_ == "conj":
 637 | +         while spanroot.dep_ == "conj":
 638 |               spanroot = spanroot.head
 639 |
 640 |       if spanroot.dep_ == "poss":
 641 |           head = spanroot.head
 642 | +         if head.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
 643 |               category = "Posessive Noun (Object)"
 644 |           elif head.dep_ in ["nsubj", "nsubjpass"]:
 645 |               category = "Posessive Noun (Subject)"
 646 |           else:
 647 |               category = "Posessive Noun (Other)"
 648 |
 649 | +     ## Conjunctions
 650 |       # Preconjunctions
 651 | +     if spanroot.dep_ in ["preconj", "cc"]:
 652 |           category = "Conjunction"
 653 |
 654 |       ## NOUN PHRASES
 655 |       # adjectival modifiers
 656 | +     if spanroot.dep_ in ["amod"]:
 657 |           category = "Adjectival modifier"
 658 |       # compound nouns
 659 | +     if spanroot.dep_ in ["compound"]:
 660 |           category = "Compound noun"
 661 |
 662 |       ## Nominal category
 663 | +     if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
 664 |           if "acl" in c_dep:
 665 |               category = "Noun + Complement (Object)"
 666 |           else:
 674 |
 675 |       ## ADJUNCTS
 676 |       # prep phrases
 677 | +     if spanroot.dep_ in ["prep", "agent"]:
 678 | +         category = "Prepositional phrase"
 679 |
 680 |       # adverbial phrases
 681 | +     if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"]:
 682 |           category = "Adverbial phrase"
 683 |
 684 |       ## Predication patterns
 685 | +     if spanroot.dep_ in ["acomp", "oprd"]:
 686 |           if "xcomp" in c_dep:
 687 |               category = "Subject predicate to-cl"
 688 |           else:
 689 |               category = "Adjectival complement"
 690 |
 691 | +     if spanroot.dep_ in ["attr"]:
 692 | +         subjless = all(
 693 | +             c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
 694 | +             for c in spanroot.children
 695 | +         )
 696 |
 697 |           c_head = [c.dep_ for c in spanroot.head.children]
 698 |           if "expl" in c_head and "no_det" in span_t_dep_:
 701 |               category = "There is/are + Noun complement"
 702 |           elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
 703 |               category = "There is/are + Noun complement"
 704 | +
 705 |           elif spanroot.pos_ in ["NOUN", "PRON"]:
 706 |               if "acl" in c_dep:
 707 |                   category = "Noun + Complement (attr)"
 708 |               else:
 709 |                   category = "Nominal complement"
 710 |
 711 | +         elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
 712 |               category = "Main verb 4"
 713 |
 714 | +         elif spanroot.tag_ in ["NNP"]:
 715 |               category = "Nominal complement"
 716 |
 717 |       ## External comp
 718 | +     if spanroot.dep_ in ["xcomp"]:
 719 | +         if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
 720 |               category = "Adjective complement to-cl"
 721 | +         if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
 722 |               category = "Verb complement to-cl"
 723 | +
 724 | +     if spanroot.dep_ in ["pcomp"]:
 725 | +         if (
 726 | +             str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
 727 | +             and "ccomp" in c_dep
 728 | +         ):
 729 |               category = "Participle + that-cl"
 730 |           elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
 731 |               category = "Participle"
 735 |       # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
 736 |       #     category = "Gerund"
 737 |
 738 | +     if spanroot.dep_ in ["neg"]:
 739 |           category = "Negative particle"
 740 | +     if spanroot.dep_ in ["aux", "auxpass"]:
 741 |           category = "Auxiliary"
 742 |
 743 |       # Modal verbs
 744 |       if spanroot.tag_ == "MD":
 745 |           category = "Modal auxiliary"
 746 |
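subjless and argmentless above are the two guards reused throughout the classifier: subjless holds when no child of the span root is a subject, argmentless when the root has no core argument at all. A small illustration — hypothetical sentence, en_core_web_sm assumed:

    import spacy

    SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass"]
    CORE_ARGS = SUBJECTS + ["dobj", "ccomp", "xcomp", "dative", "attr", "oprd", "acomp"]

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Strictly speaking, the answer is no.")
    spanroot = doc[1]                                   # "speaking"
    subjless = all(c.dep_ not in SUBJECTS for c in spanroot.children)
    argmentless = all(c.dep_ not in CORE_ARGS for c in spanroot.children)
    # both True here, which is what routes "strictly speaking" to "Adverbial Phrase".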
 747 |       ####################################
 748 |       ### clausal ####
 749 |       ####################################
 750 | +     if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"]:
 751 | +         _check_to = [
 752 | +             c.dep_
 753 | +             for c in spanroot.subtree
 754 | +             if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
 755 | +             and c.head.dep_ == "xcomp"
 756 | +         ]
 757 | +         _check_ing = [
 758 | +             c.dep_
 759 | +             for c in spanroot.subtree
 760 | +             if "Prog" in str(c.morph) and c.dep_ == "xcomp"
 761 | +         ]
 762 | +         root_before_ccomp = [
 763 | +             c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
 764 | +         ]
 765 | +
 766 | +         _check_for_to = [
 767 | +             "_".join([c.norm_, c.dep_])
 768 | +             for c in spanroot.subtree
 769 | +             if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
 770 | +         ]
 771 | +         entire_cl = (
 772 | +             spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
 773 | +         )
 774 |
 775 |           ## Start with broad category, which is then re-evaluated for specific constructions.
 776 | +         if spanroot.dep_ in ["advcl", "acl", "punct", "pcomp"]:  # 'mark',
 777 |               ## Adverbial clauses
 778 | +             subjless = all(
 779 | +                 c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
 780 | +                 for c in spanroot.children
 781 | +             )
 782 | +             entire_cl = (
 783 | +                 spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
 784 | +             )
 785 |
 786 |               ### Finite-adverbial clauses
 787 | +             if "mark" in span_dep and (
 788 | +                 spanroot.pos_ in ["VERB", "AUX"] or "aux" in span_dep
 789 | +             ):
 790 |                   category = "Finite adverbial clause"
 791 |
 792 | +             elif "mark" in span_dep and "aux" in span_dep:
 793 |                   category = "Finite adverbial clause"
 794 |
 795 | +             elif (
 796 | +                 "mark" in span_dep
 797 | +                 and spanroot.pos_ in ["VERB", "AUX"]
 798 | +                 and "expl" in c_dep
 799 | +             ):
 800 |                   category = "Finite adverbial clause"
 801 |
 802 |               elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
 803 | +                 if spanroot.pos_ in ["VERB", "AUX"]:
 804 |                       category = "Finite adverbial clause"
 805 |
 806 | +                 elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
 807 |                       category = "Non-finite adv clause 1"
 808 |
 809 |               elif not argmentless:
 810 | +                 category = "Finite adverbial clause"
 811 |
 812 |               ## non-finite
 813 | +             elif (
 814 | +                 str(spanroot.morph)
 815 | +                 in [
 816 | +                     "Aspect=Prog|Tense=Pres|VerbForm=Part",
 817 | +                     "Aspect=Perf|Tense=Past|VerbForm=Part",
 818 | +                 ]
 819 | +                 and "aux" not in c_dep
 820 | +             ):
 821 |                   # he doing his job
 822 |                   if argmentless:
 823 | +                     # e.g., frankly speaking, strictly speaking
 824 |                       category = "Adverbial Phrase"
 825 |                   else:
 826 |                       category = "Non-finite adv clause 2"
 827 |
 828 | +             elif (
 829 | +                 spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
 830 | +             ):
 831 |                   category = "Non-finite adv clause 3"
 832 | +
 833 |               elif "aux" in c_dep and "TO" in c_tag:
 834 |                   category = "Adverbial Phrase"
 835 |
 836 | +             elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
 837 |                   category = "Dependent Verb phrase"
 838 | +
 839 |               elif not argmentless:
 840 | +                 category = "Adverbial clause"
 841 | +
 842 |               elif spanroot.dep_ == "advcl":
 843 | +                 category = "Adverbial phrase"
 844 | +
 845 |               else:
 846 |                   category = "Finite adverbial clause "
 847 |
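_check_to above collects bare "to" auxiliaries whose head is an xcomp, i.e. evidence that the clause embeds a to-infinitive. A sketch with a hypothetical sentence (en_core_web_sm assumed):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("It is possible to leave early.")
    spanroot = doc[1]                                   # "is"
    _check_to = [
        c.dep_
        for c in spanroot.subtree
        if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"
    ]
    # a non-empty list is truthy, which is exactly what branches
    # such as "elif _check_to:" below test for.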
 848 | +     if spanroot.dep_ in ["relcl", "ccomp", "acl", "punct", "pcomp"]:
 849 |           head = spanroot.head
 850 |           if ";" in [t.norm_ for t in head.children]:
 851 |               category = "Main verb 3"
 855 |
 856 |           elif "mark" in span_dep:
 857 |               category = "Complement clause"
 858 | +         elif (
 859 | +             str(spanroot.morph)
 860 | +             in [
 861 | +                 "Aspect=Prog|Tense=Pres|VerbForm=Part",
 862 | +                 "Aspect=Perf|Tense=Past|VerbForm=Part",
 863 | +             ]
 864 | +             and "aux" not in c_dep
 865 | +         ):
 866 |               category = "Non-finite complement clause"
 867 | +         elif spanroot.dep_ in ["relcl"]:
 868 |               category = "Relative clause"
 869 | +         elif spanroot.dep_ in ["ccomp"]:
 870 |               category = "Complement clause"
 871 | +         elif spanroot.dep_ in ["acl"]:
 872 |               category = "Noun Complement clause"
 873 |
 874 |           ## Specific constructions
 875 |           # Extraposed that-clause or to-infinitives
 876 | +         if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
 877 | +             "VERB",
 878 | +             "AUX",
 879 | +         ]:
 880 |               # print(c_dep)
 881 |               if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
 882 | +                 # e.g., it seems odd (oprd) that X.
 883 | +                 # e.g., it is certain (acomp) that X.
 884 | +                 category = (
 885 | +                     "Extraposed that-cl (adj-complement)"  # e.g., it is certain that X.
 886 | +                 )
 887 |
 888 |               elif "xcomp" in c_dep or ("advcl" in c_dep):
 889 |                   if "for_mark" in _check_for_to:
 890 | +                     category = (
 891 | +                         "Extraposed to-cl (explicit subj)"  # e.g., It is possible to .
 892 | +                     )
 893 |                   elif _check_to:
 894 | +                     category = "Extraposed to-cl 1"  # e.g., It is possible to .
 895 |                   elif _check_ing:
 896 | +                     category = "Extraposed -ing 1"  # e.g., It is possible to .
 897 | +                 elif (
 898 | +                     ("prep" in right_dep or "npadvmod" in right_dep)
 899 | +                     and "ccomp" in right_dep
 900 | +                     and spanroot.lemma_ == "be"
 901 | +                 ):
 902 | +                     category = "Cleft construction"
 903 |
 904 |               elif "attr" in c_dep:
 905 | +                 category = "Extraposed that-cl (copula)"  # e.g., It is a wonder that X.
 906 |
 907 |               else:
 908 | +                 category = "Extraposed that-cl (VERB)"
 909 |
 910 |           # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
 911 |           #     category = "Extraposed that-cl (VERB)1"  # e.g., it has been shown that X.
 912 | +         elif (
 913 | +             "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
 914 | +         ) and "acomp" in c_dep:
 915 |               if "xcomp" in c_dep:
 916 |                   if _check_to:
 917 | +                     category = "Extraposed to-cl 2"  # e.g., it is difficult to decide.
 918 |                   elif _check_ing:
 919 | +                     category = "Extraposed -ing 2"  # e.g., it is difficult to decide.
 920 | +
 921 |               else:
 922 |                   category = "Extraposed that-cl (adj-complement) 2"
 923 |
 924 |           elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
 925 | +             category = (
 926 | +                 "Extraposed that-cl (adj-complement) 3"  # e.g., it seems odd that X.
 927 | +             )
 928 |
 929 |           # something without dummy subject "it"
 930 | +         elif (
 931 | +             (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
 932 | +             and spanroot.pos_ in ["AUX", "VERB"]
 933 | +             and "it" not in c_norm
 934 | +         ):
 935 |               # store xcomp, if the head of the xcomp is acomp
 936 | +             _check_xcomp = [
 937 | +                 c.dep_
 938 | +                 for c in spanroot.subtree
 939 | +                 if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
 940 | +             ]
 941 | +             _check_ccomp = [
 942 | +                 c.dep_
 943 | +                 for c in spanroot.subtree
 944 | +                 if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
 945 | +             ]
 946 |               # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
 947 |               # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 948 |
 949 |               if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
 950 |                   if any(root_before_ccomp):
 958 |               elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
 959 |                   category = "Post-predicate to-cl"
 960 |
 961 | +             elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
 962 |                   category = "Subject predicate to-cl"
 963 |
 964 |               elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
 965 |                   category = "Subject predicate to-cl (passive)"
 966 |
 967 | +             elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
 968 |                   category = "Subject predicate -ing"
 969 |               elif "ccomp" in c_dep:
 970 |                   category = "Subject predicate that-cl"
 979 |                   category = "Main verb 1"
 980 |
 981 |           ## without dummy subject it, and lexical verbs
 982 | +         elif (
 983 | +             ("nsubj" in c_dep or "nsubjpass" in c_dep)
 984 | +             and spanroot.pos_ in ["AUX", "VERB"]
 985 | +             and "it" not in c_norm
 986 | +             and spanroot.lemma_ not in ["be"]
 987 | +         ):
 988 | +             _check_wh = [
 989 | +                 c.dep_
 990 | +                 for c in spanroot.subtree
 991 | +                 if (
 992 | +                     c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
 993 | +                     and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
 994 | +                 )
 995 | +                 and c.head.dep_ == "ccomp"
 996 | +             ]
 997 | +             _check_if = [
 998 | +                 c.dep_
 999 | +                 for c in spanroot.subtree
1000 | +                 if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
1001 | +                 and c.head.dep_ == "ccomp"
1002 | +             ]
1003 |
1004 |               # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
1005 |               # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
1019 |               elif _check_ing:
1020 |                   category = "Post-predicate -ing"
1021 |
1022 |               # Existential
1023 |               elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
1024 |                   category = "There is/are NOUN"
1025 | +
1026 | +         elif (
1027 | +             "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
1028 | +         ):
1029 |               category = "Cleft construction"
1030 |
1031 |           ### The end of clausal analysis
1032 | +
1033 | +         if spanroot.dep_ in ["parataxis"]:
1034 | +             if "_".join(span_dep) in [
1035 | +                 "nsubj_parataxis",
1036 | +                 "aux_parataxis",
1037 | +                 "nsubj_aux_parataxis",
1038 | +             ]:
1039 |                   category = "Comment clause"
1040 |               else:
1041 |                   category = "Parataxis"
1042 |
1043 | +         if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
1044 | +             if (
1045 | +                 spanroot.head.dep_ in ["ROOT", "ccomp"]
1046 | +                 and spanroot.head.pos_ in ["AUX", "VERB"]
1047 | +                 and spanroot.pos_ in ["AUX", "VERB"]
1048 | +             ):
1049 |                   if spanroot.morph == spanroot.head.morph:
1050 |                       category = "Main verb 4"
1051 |                   else:
1052 |                       category = "Dependent verb 2"
1053 |               elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
1054 |                   category = "Gerund"
1055 | +             elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(
1056 | +                 spanroot.morph
1057 | +             ):
1058 |                   category = "Dependent verb 2"
1059 | +             elif spanroot.dep_ in ["csubj", "csubjpass"]:
1060 |                   category = "Dependent verb (csubj)"
1061 |
1062 |           # Appositive phrases
1063 | +         if spanroot.dep_ in ["appos"]:
1064 |               if "nummod" in c_dep:
1065 |                   category = "Apposition"
1066 |               if spanroot.pos_ in ["PROPN"]:
1068 |               elif spanroot.pos_ in ["NOUN"]:
1069 |                   category = "Appositive Noun Phrase"
1070 |               elif spanroot.pos_ in ["VERB", "AUX"]:
1071 | +                 _check = any(
1072 | +                     c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
1073 | +                     for c in spanroot.children
1074 | +                 )
1075 |                   if _check:
1076 |                       category = "Appositive Finite-clause"
1077 |
1078 | +         if spanroot.dep_ in ["appos", "dep", "attr"]:
1079 | +             if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
1080 |                   category = "Main verb (likely parsing error)"
1081 |
1082 | +         # sometimes the dep label lands on the conjunction
1083 |           if spanroot.dep_ in ["dep", "mark"]:
1084 |               if spanroot.tag_ in ["RB", "IN", "CC"]:
1085 |                   category = "Conjunction"
1087 |           if spanroot.dep_ in ["intj"]:
1088 |               category = "Introjection"
1089 |
1090 | +         # sometimes the extra-clausal links are not accurate
1091 | +         if (
1092 | +             spanroot.dep_
1093 | +             in ["aux", "auxpass", "oprd", "appos", "xcomp", "attr", "dep", "meta", "prt"]
1094 | +             and category is None
1095 | +         ):
1096 |               if spanroot.head.dep_ == "ROOT":
1097 |                   category = "Main verb"
1098 |               else:
1100 |
1101 |           if span.label_ == "CITATION":
1102 |               if "NNP" in span_tag or "NNPS" in span_tag:
1103 | +                 if span_dep[0] == "punct" and span_dep[-1] == "punct":
1104 |                       category = "Parenthetical Citation"
1105 |                   elif span_tag[0] in ["NNP", "NNPS"]:
1106 |                       category = "Narrative Citation"
1113 |       return category
1114 |
1115 |
1116 | + def const_table(
1117 | +     doc: Union[spacy.tokens.Doc, Dict[str, str]],
1118 | +     spans_key: str = "sc",
1119 | +     attrs: List[str] = SPAN_ATTRS,
1120 | + ):
1121 | +     columns = attrs + [
1122 | +         "Conf. score",
1123 | +         "sent no.",
1124 | +         "grammatical realization",
1125 | +         "span dep",
1126 | +         "ner",
1127 | +         "POS",
1128 | +         "span dep seq",
1129 | +         "TAG sequence",
1130 | +         "POS sequence",
1131 | +         "head",
1132 | +         "head dep",
1133 | +         "children",
1134 | +         "morphology",
1135 | +         "sent",
1136 | +     ]
1137 |       data = []
1138 |       # data = span_info_aggregator(doc, columns)
1139 |       sentences = {s: i for i, s in enumerate(doc.sents)}
1140 |
1141 | +     for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs["scores"]):
1142 |           span_info = []
1143 |           span_info.extend([str(getattr(span, attr)) for attr in attrs])
1144 |
1154 |           span_info.append(span.root.head.norm_)
1155 |           span_info.append(span.root.head.dep_)
1156 |           span_info.append("_".join([c.dep_ for c in span.root.children]))
1157 | +         span_info.append(str(span.root.morph))
1158 |           span_info.append(span.sent.text.strip())
1159 |
1160 |           data.append(span_info)
1162 |       return data, columns
1163 |
1164 |
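const_table pairs each span in doc.spans[spans_key] with the pipeline's span score (doc.spans[spans_key].attrs["scores"]) and flattens both into rows. A usage sketch matching how utils/visualize.py consumes it — the doc object is assumed to come from the app's loaded pipeline:

    import pandas as pd

    data, cols = const_table(doc, spans_key="sc")
    df = pd.DataFrame(data, columns=cols).astype({"start": int, "end": int})
    df = df.sort_values(by=["start"])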
1165 | + def ngrammar(seq: list, n=2, concat=False, sep="-"):
1166 |       result = []
1167 |       n_item = len(seq)
1168 |       for idx, item in enumerate(seq):
1169 |           if idx + n <= n_item:
1170 |               if concat:
1171 | +                 result.append(sep.join(seq[idx : idx + n]))
1172 |               else:
1173 | +                 result.append(seq[idx : idx + n])
1174 |       return result
1175 |
1176 |
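ngrammar slides a window of n items over a label sequence; with concat=True each window is joined with sep. Since it is pure Python, its behaviour is easy to check:

    >>> ngrammar(["ENTERTAIN", "ATTRIBUTE", "MONOGLOSS"], n=2, concat=True)
    ['ENTERTAIN-ATTRIBUTE', 'ATTRIBUTE-MONOGLOSS']
    >>> ngrammar(["ENTERTAIN", "ATTRIBUTE", "MONOGLOSS"], n=2)
    [['ENTERTAIN', 'ATTRIBUTE'], ['ATTRIBUTE', 'MONOGLOSS']]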
1177 |   def diversity_values(count_vec: list):
1178 |       result = {}
1179 |       if len(count_vec) == 0:
1180 | +         count_vec = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1181 |
1182 | +     result["shannon"] = dv.alpha.shannon(list(count_vec), base=2)
1183 | +     result["brillouin_d"] = dv.alpha.brillouin_d(list(count_vec))
1184 | +     result["simpson_d"] = 1 - dv.alpha.simpson(list(count_vec))
1185 | +     result["simpson_e"] = dv.alpha.simpson_e(list(count_vec))
1186 |       # result['gini_index'] = dv.alpha.gini_index(list(count_vec))
1187 |       # result['faith_pd'] = dv.alpha.faith_pd(list(count_vec))
1188 |
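diversity_values feeds one vector of per-label counts to scikit-bio's alpha-diversity measures (note that simpson_d is stored as 1 - simpson), and the zero vector is a fallback for texts with no spans. A sketch with hypothetical counts:

    counts = [4, 2, 0, 1, 0, 0, 3, 0, 1, 0, 2]   # one count per engagement category
    div = diversity_values(counts)
    # {'shannon': ..., 'brillouin_d': ..., 'simpson_d': ..., 'simpson_e': ...}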
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
   1 | - pip==24.2
   1 | + pip == 24.2
   2 |   spacy-streamlit #==1.0.4
   3 |   spacy>3.4.4, <3.7
   4 |   # spacy-experimental==0.6.1

utils/__pycache__/visualize.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/visualize.cpython-310.pyc and b/utils/__pycache__/visualize.cpython-310.pyc differ
utils/visualize.py
CHANGED
@@ -17,7 +17,12 @@ import streamlit as st
  17 |   from spacy_streamlit import visualize_spans
  18 |   from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO
  19 |
  20 | - from pipeline.post_processors import …
  20 | + from pipeline.post_processors import (
  21 | +     simple_table,
  22 | +     const_table,
  23 | +     ngrammar,
  24 | +     diversity_values,
  25 | + )
  26 |   from skbio import diversity as dv
  27 |
  28 |   SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))
@@ -43,6 +48,9 @@ def visualize_spans(
  48 |       manual: bool = False,
  49 |       displacy_options: Optional[Dict] = None,
  50 |       simple: bool = True,
  51 | +     show_confidence: bool = False,
  52 | +     show_diversity: bool = False,
  53 | +     show_ngrams: bool = False,
  54 |   ):
  55 |       """
  56 |       Visualizer for spans.
@@ -100,13 +108,15 @@ def visualize_spans(
 108 |       df = pd.DataFrame(data, columns=cols)
 109 |       df = df.astype({"start": int, "end": int})
 110 |       df = df.sort_values(by=['start'])
 103 | -     st.subheader("…
 111 | +     st.subheader("Engagement span information")
 112 | +
 113 |       st.dataframe(
 114 |           df.style.highlight_between(subset='Conf. score', right=.7))
 115 |
 107 | -     …
 109 | -     counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
 116 | +     counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
 117 | +
 118 | +     if show_confidence:
 119 |           st.subheader("Label counts & Diagnostic confidence score summary")
 120 |
 121 |       print(counts)
 122 |       print(list(counts))
@@ -119,7 +129,9 @@ def visualize_spans(
 129 |       st.dataframe(label_counts)
 130 |       # print(list(label_counts))
 131 |
 132 | +     if show_ngrams:
 133 |           sequences = list(df['label_'])
 134 | +
 135 |           # Engagement ngrams
 136 |           span_bigrams = ngrammar(seq=sequences, n=2, concat=True)
 137 |           span_trigrams = ngrammar(seq=sequences, n=3, concat=True)
@@ -132,20 +144,26 @@ def visualize_spans(
 144 |       label_dep = pd.crosstab(df['grammatical realization'], df['label_'])
 145 |       st.dataframe(label_dep)
 146 |
 135 | -     …
 147 | +     if show_diversity:
 148 | +         st.subheader('Diversity of rhetorical features')
 149 |           # st.markdown(
 150 |           #     f"Shannon's index: {dv.alpha.shannon(list(counts), base=2): .3f}")
 151 |           # st.markdown(
 152 |           #     f"Simpson's e index: {1 - dv.alpha.simpson_e(list(counts)): .3f}")
 153 |
 154 | +         st.markdown("##### Entropy based diversity measures")
 155 | +
 156 | +         filename = "NA"
 157 | +
 158 |           div = diversity_values(list(counts))
 159 |           div_data = pd.DataFrame.from_dict(div, orient='index')
 143 | -         st.dataframe(div_data)
 160 | +         # st.dataframe(div_data)
 161 |
 145 | -         doc_data = pd.concat([counts, …
 162 | +         doc_data = pd.concat([div_data, counts], axis=0).T
 163 |           filename = "NA"
 164 |           doc_data.insert(0, "filename", filename, True)
 165 |           doc_data.insert(1, "nwords", len(doc), True)
 166 |           st.dataframe(doc_data)
 167 | +
 168 |       # st.markdown(str(dv.alpha_diversity(metric = "shannon", counts=counts, ids = ['ENTERTAIN', 'ATTRIBUTE', 'CITATION', 'COUNTER', 'DENY', 'ENDORSE', 'PRONOUNCE', 'CONCUR', 'MONOGLOSS', 'SOURCES', 'JUSTIFYING'])))
 169 |       # print(dv.get_alpha_diversity_metrics())
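With these flags the confidence, n-gram, and diversity panels become opt-in. A call sketch — the doc and spans_key follow the defaults used elsewhere in the app, and the remaining arguments keep their defaults:

    visualize_spans(
        doc,
        spans_key="sc",
        simple=True,
        show_confidence=True,
        show_ngrams=True,
        show_diversity=True,
    )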