bestroi committed on
Commit
0d77127
1 Parent(s): f2ebbc4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import json
3
+ from datetime import datetime
4
+ import streamlit as st
5
+
6
@st.cache_resource
def _load_pipeline():
    """Load the 'la_core_web_lg' spaCy pipeline once per server process.

    Streamlit re-executes this script from top to bottom on every widget
    interaction; without caching, the model would be reloaded on each
    button click.
    """
    return spacy.load('la_core_web_lg')


# Shared pipeline used by tokenize_with_spacy() and annotate_text().
nlp = _load_pipeline()
7
+
8
def tokenize_with_spacy(text):
    """Split *text* into surface tokens with the module-level spaCy pipeline.

    Args:
        text: Raw input string.

    Returns:
        A list containing the ``text`` of every token spaCy produced,
        in document order.
    """
    parsed = nlp(text)
    surface_forms = []
    for piece in parsed:
        surface_forms.append(piece.text)
    return surface_forms
11
+
12
# Morphological features exported per token: (output key, feature name
# looked up in the spaCy token's ``morph`` analysis).
_MORPH_FEATURES = (
    ('aspect', 'Aspect'),
    ('tense', 'Tense'),
    ('verbForm', 'VerbForm'),
    ('voice', 'Voice'),
    ('mood', 'Mood'),
    ('number', 'Number'),
    ('person', 'Person'),
    ('case', 'Case'),
    ('gender', 'Gender'),
)


def annotate_text(segmented_text):
    """Annotate pre-segmented tokens with POS, lemma and morphology.

    Each token string is analysed on its own by the module-level ``nlp``
    pipeline, and only the first spaCy token of that analysis is used.

    Args:
        segmented_text: Iterable of token strings.

    Returns:
        A list with one dict per input token, in input order. Each dict has
        the keys ``token``, ``pos``, ``lemma``, ``aspect``, ``tense``,
        ``verbForm``, ``voice``, ``mood``, ``number``, ``person``, ``case``
        and ``gender``. All values are strings; a morphological feature with
        several values is joined with ``', '`` and a missing one is ``''``.
        A token string for which spaCy produces no tokens (e.g. ``''``) gets
        empty strings for every annotation instead of raising IndexError.
    """
    # Materialise once so generators can be both batched and zipped.
    tokens = list(segmented_text)
    annotated_tokens = []
    # nlp.pipe processes the strings in batches and yields docs in order.
    for token, doc in zip(tokens, nlp.pipe(tokens)):
        first = doc[0] if len(doc) else None
        annotated_token = {
            'token': token,
            'pos': str(first.pos_) if first is not None else '',
            'lemma': str(first.lemma_) if first is not None else '',
        }
        for key, feature in _MORPH_FEATURES:
            # morph.get() returns a (possibly empty) list of values, so the
            # join yields '' when the feature is absent.
            values = first.morph.get(feature) if first is not None else []
            annotated_token[key] = ', '.join(values)
        annotated_tokens.append(annotated_token)
    return annotated_tokens
32
+
33
def save_annotations_as_json(annotated_text, filename):
    """Serialise *annotated_text* to *filename* as indented UTF-8 JSON.

    Args:
        annotated_text: JSON-serialisable annotation data.
        filename: Path of the file to create or overwrite.

    Non-ASCII characters are written verbatim rather than as ``\\uXXXX``
    escapes, and the output is indented by four spaces.
    """
    with open(filename, mode='w', encoding='utf-8') as sink:
        json.dump(annotated_text, sink, indent=4, ensure_ascii=False)
36
+
37
st.title("Annotation Tool")

# Add Materialize CSS
st.markdown('<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/materialize/1.0.0/css/materialize.min.css">', unsafe_allow_html=True)

# Display label -> key in the dicts produced by annotate_text(), in the
# order the fields are shown for each token.
_DISPLAY_FIELDS = (
    ("Token", "token"),
    ("POS", "pos"),
    ("Lemma", "lemma"),
    ("Aspect", "aspect"),
    ("Tense", "tense"),
    ("VerbForm", "verbForm"),
    ("Voice", "voice"),
    ("Mood", "mood"),
    ("Number", "number"),
    ("Person", "person"),
    ("Case", "case"),
    ("Gender", "gender"),
)

text = st.text_area("Text")
if st.button("Annotate"):
    if text:
        segmented_text = tokenize_with_spacy(text)
        annotated_text = annotate_text(segmented_text)
        # Streamlit reruns the whole script on every widget interaction, so
        # the result must live in session state to still be available when
        # the "Save" button triggers the next rerun.
        st.session_state["annotated_text"] = annotated_text
        st.subheader("Segmented Text:")
        st.write(segmented_text)
        st.subheader("Annotated Text:")

        for token in annotated_text:
            for label, key in _DISPLAY_FIELDS:
                # Values are already comma-joined strings; joining them again
                # would insert ", " between every character.
                st.write(f"{label}: {token[key]}")
            st.write("-----")

    else:
        st.warning("Please enter some text.")

if st.button("Save Modifications as JSON"):
    annotated_text = st.session_state.get("annotated_text")
    if annotated_text:
        # Generate a unique file name using a timestamp
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        filename = f'annotations_{timestamp}.json'
        save_annotations_as_json(annotated_text, filename)
        st.success(f"Annotations saved as {filename}")
    else:
        st.warning("Nothing to save yet. Annotate some text first.")

# Add Materialize JavaScript
# NOTE(review): <script> tags injected through st.markdown are presumably not
# executed by the browser - confirm this line has any effect.
st.markdown('<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/1.0.0/js/materialize.min.js"></script>', unsafe_allow_html=True)