# Spaces: Running
import json
from datetime import datetime

import spacy
import streamlit as st
def _load_pipeline():
    """Load the `la_core_web_lg` spaCy pipeline used by every function below."""
    return spacy.load('la_core_web_lg')


# Streamlit re-executes this script from the top on every widget interaction,
# so an uncached load would re-read the large model each time. Cache it when
# the running Streamlit provides `st.cache_resource`; otherwise fall back to a
# plain load so older installations keep working.
if hasattr(st, "cache_resource"):
    _load_pipeline = st.cache_resource(_load_pipeline)

nlp = _load_pipeline()
def tokenize_with_spacy(text):
    """Split ``text`` into token strings with the module-level ``nlp`` pipeline.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list containing the ``.text`` of each token spaCy produced, in
        document order.
    """
    return [tok.text for tok in nlp(text)]
def annotate_text(segmented_text):
    """Annotate token strings with POS, lemma and morphological features.

    Each token string is analysed on its own by the module-level ``nlp``
    pipeline, and the analysis of the first spaCy token in the result is
    recorded.

    Args:
        segmented_text: Iterable of token strings, e.g. the output of
            ``tokenize_with_spacy``.

    Returns:
        A list with one dict per input token, in input order. Each dict has
        the keys ``token``, ``pos``, ``lemma``, ``aspect``, ``tense``,
        ``verbForm``, ``voice``, ``mood``, ``number``, ``person``, ``case``
        and ``gender``. All values are strings; multiple values of one
        morphological feature are joined with ``", "`` and an absent feature
        is the empty string.
    """
    # Output key -> spaCy morphology field name.
    morph_fields = (
        ('aspect', 'Aspect'),
        ('tense', 'Tense'),
        ('verbForm', 'VerbForm'),
        ('voice', 'Voice'),
        ('mood', 'Mood'),
        ('number', 'Number'),
        ('person', 'Person'),
        ('case', 'Case'),
        ('gender', 'Gender'),
    )

    # Materialise once so a generator argument is not consumed twice by zip().
    tokens = list(segmented_text)

    # NOTE(review): analysing each token in isolation means the tagger sees no
    # sentence context — confirm that this is intended.
    annotated_tokens = []
    # nlp.pipe processes the strings in batches instead of one call per token.
    for token, doc in zip(tokens, nlp.pipe(tokens)):
        annotated_token = {'token': token}
        if len(doc) > 0:
            first = doc[0]
            annotated_token['pos'] = str(first.pos_)
            annotated_token['lemma'] = str(first.lemma_)
            for key, field in morph_fields:
                # .get() returns an empty list for a missing field, which
                # joins to the empty string.
                annotated_token[key] = ', '.join(first.morph.get(field))
        else:
            # An empty string produces an empty Doc; emit a blank record so
            # the output stays aligned with the input instead of raising
            # IndexError on doc[0].
            annotated_token['pos'] = ''
            annotated_token['lemma'] = ''
            for key, _field in morph_fields:
                annotated_token[key] = ''
        annotated_tokens.append(annotated_token)
    return annotated_tokens
def save_annotations_as_json(annotated_text, filename):
    """Write ``annotated_text`` to ``filename`` as pretty-printed JSON.

    The file is opened for writing as UTF-8 (replacing any existing content).
    The JSON uses a 4-space indent and keeps non-ASCII characters as-is
    rather than escaping them.

    Args:
        annotated_text: JSON-serialisable object to store.
        filename: Path of the file to write.
    """
    with open(filename, mode='w', encoding='utf-8') as out:
        out.write(json.dumps(annotated_text, ensure_ascii=False, indent=4))
st.title("Annotation Tool")

# Add Materialize CSS
st.markdown('<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/materialize/1.0.0/css/materialize.min.css">', unsafe_allow_html=True)

text = st.text_area("Text")

# Streamlit re-runs this whole script on every widget interaction, and a
# button is only True during the run triggered by its own click. Annotations
# produced by "Annotate" are therefore kept in session state so that they
# still exist on the later run triggered by the "Save" button.
if st.button("Annotate"):
    if text:
        segmented_text = tokenize_with_spacy(text)
        annotated_text = annotate_text(segmented_text)
        st.session_state["annotated_text"] = annotated_text

        st.subheader("Segmented Text:")
        st.write(segmented_text)

        st.subheader("Annotated Text:")
        # (label shown to the user, key in the annotation dict)
        display_fields = (
            ("Token", "token"),
            ("POS", "pos"),
            ("Lemma", "lemma"),
            ("Aspect", "aspect"),
            ("Tense", "tense"),
            ("VerbForm", "verbForm"),
            ("Voice", "voice"),
            ("Mood", "mood"),
            ("Number", "number"),
            ("Person", "person"),
            ("Case", "case"),
            ("Gender", "gender"),
        )
        for token in annotated_text:
            for label, key in display_fields:
                # The values are already joined strings; joining them again
                # would insert ", " between every character.
                st.write(f"{label}: {token[key]}")
            st.write("-----")
    else:
        st.warning("Please enter some text.")

if st.button("Save Modifications as JSON"):
    annotated_text = st.session_state.get("annotated_text")
    if annotated_text:
        # Generate a unique file name using a timestamp
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        filename = f'annotations_{timestamp}.json'
        save_annotations_as_json(annotated_text, filename)
        st.success(f"Annotations saved as {filename}")
    else:
        st.warning("Nothing to save yet. Please annotate some text first.")

# Add Materialize JavaScript
# NOTE(review): a <script> tag injected through st.markdown may not be executed
# by the browser — confirm whether this line has any effect.
st.markdown('<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/1.0.0/js/materialize.min.js"></script>', unsafe_allow_html=True)