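# Gradio demo app that loads pretrained spaCy pipelines and visualizes their
# output with displaCy: dependency parses, named entities, token attributes,
# similarity scores, and custom spans.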
import spacy
from spacy import displacy
import random
from spacy.tokens import Span
import gradio as gr
import pandas as pd
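# Demo defaults. DEFAULT_MODEL is the base pipeline name; a size suffix
# ("_sm" or "_md") is appended when the pipeline is loaded below.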
DEFAULT_MODEL = "en_core_web"
DEFAULT_TEXT = "Apple is looking at buying U.K. startup for $1 billion."
DEFAULT_TOK_ATTR = ['idx', 'text', 'pos_', 'lemma_', 'shape_', 'dep_']
DEFAULT_ENTS = ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY',
'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']
texts = {
    "en": DEFAULT_TEXT,
    "ca": "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars",
    "da": "Apple overvejer at købe et britisk startup for 1 milliard dollar.",
    "de": "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
    "el": "Η άνιση κατανομή του πλούτου και του εισοδήματος, η οποία έχει λάβει τρομερές διαστάσεις, δεν δείχνει τάσεις βελτίωσης.",
    "es": "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
    "fi": "Itseajavat autot siirtävät vakuutusvastuun autojen valmistajille",
    "fr": "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars",
    "it": "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
    "ja": "アップルがイギリスの新興企業を10億ドルで購入を検討",
    "ko": "애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.",
    "lt": "Jaunikis pirmąją vestuvinę naktį iškeitė į areštinės gultą",
    "nb": "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.",
    "nl": "Apple overweegt om voor 1 miljard een U.K. startup te kopen",
    "pl": "Poczuł przyjemną woń mocnej kawy.",
    "pt": "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
    "ro": "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari",
    "ru": "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд",
    "sv": "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
    "zh": "作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",
}
def get_all_models():
with open("requirements.txt") as f:
content = f.readlines()
models = []
for line in content:
if "huggingface.co" in line:
model = "_".join(line.split("/")[4].split("_")[:3])
if model not in models:
models.append(model)
return models
models = get_all_models()
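
# Render the dependency parse of the input text as displaCy HTML.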
def dependency(text, col_punct, col_phrase, compact, model):
nlp = spacy.load(model + "_sm")
doc = nlp(text)
options = {"compact": compact, "collapse_phrases": col_phrase,
"collapse_punct": col_punct}
html = displacy.render(doc, style="dep", options=options)
return html
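
# Render the named entities of the input text as displaCy HTML, restricted to
# the selected entity labels.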
def entity(text, ents, model):
nlp = spacy.load(model + "_sm")
doc = nlp(text)
options = {"ents": ents}
html = displacy.render(doc, style="ent", options=options)
return html
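
# Return a pandas DataFrame with the selected attributes for every token.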
def token(text, attributes, model):
nlp = spacy.load(model + "_sm")
data = []
doc = nlp(text)
for tok in doc:
tok_data = []
for attr in attributes:
tok_data.append(getattr(tok, attr))
data.append(tok_data)
data = pd.DataFrame(data, columns=attributes)
return data
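
# Same as token(), but returns plain rows; used to seed the initial value of
# the Dataframe component.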
def default_token(text, attributes, model):
nlp = spacy.load(model + "_sm")
data = []
doc = nlp(text)
for tok in doc:
tok_data = []
for attr in attributes:
tok_data.append(getattr(tok, attr))
data.append(tok_data)
return data
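
# Pick two random noun chunks / content words from the text and return their
# vector similarity together with the chosen strings.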
def random_vectors(text, model):
nlp = spacy.load(model + "_md")
doc = nlp(text)
    n_chunks = list(doc.noun_chunks)
words = [tok for tok in doc if not tok.is_stop and tok.pos_ not in [
'PUNCT', "PROPN"]]
str_list = n_chunks + words
choice = random.choices(str_list, k=2)
return round(choice[0].similarity(choice[1]), 2), choice[0].text, choice[1].text
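
# Vector similarity between two arbitrary input strings.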
def vectors(input1, input2, model):
nlp = spacy.load(model + "_md")
return round(nlp(input1).similarity(nlp(input2)), 2)
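
# Highlight two user-provided spans with displaCy's span visualizer. Span
# boundaries are found by matching the first and last token of each phrase
# against the tokens of the text; without input spans, the first half of the
# text and the first token are used as fallbacks.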
def span(text, span1, span2, label1, label2, model):
nlp = spacy.load(model + "_sm")
doc = nlp(text)
if span1:
idx1_1 = 0
idx1_2 = 0
idx2_1 = 0
idx2_2 = 0
span1 = [split for split in span1.split(" ") if split]
span2 = [split for split in span2.split(" ") if split]
for i in range(len(list(doc))):
tok = list(doc)[i]
if span1[0] == tok.text:
idx1_1 = i
if span1[-1] == tok.text:
idx1_2 = i + 1
if span2[0] == tok.text:
idx2_1 = i
if span2[-1] == tok.text:
idx2_2 = i + 1
doc.spans["sc"] = [
Span(doc, idx1_1, idx1_2, label1),
Span(doc, idx2_1, idx2_2, label2),
]
else:
idx1_1 = 0
idx1_2 = round(len(list(doc)) / 2)
idx2_1 = 0
idx2_2 = 1
doc.spans["sc"] = [
Span(doc, idx1_1, idx1_2, label1),
Span(doc, idx2_1, idx2_2, label2),
]
html = displacy.render(doc, style="span")
return html
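
# Look up the example sentence for the selected pipeline's language code.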
def get_text(model):
    lang = model.split("_")[0]
    return texts[lang]
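
# Build the Gradio UI: model selector and input text on top, followed by one
# section per visualization.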
demo = gr.Blocks()
with demo:
with gr.Box():
with gr.Row():
with gr.Row():
gr.Markdown("Chose a language model")
model_input = gr.Dropdown(
choices=models, value=DEFAULT_MODEL, interactive=True, label="Pretrained Pipelines")
text_button = gr.Button("Get text in new language")
with gr.Row():
text_input = gr.Textbox(
value=DEFAULT_TEXT, interactive=True, label="Input Text")
button = gr.Button("Generate", variant="primary")
with gr.Column():
gr.Markdown("Dependency Parser")
col_punct = gr.Checkbox(label="Collapse Punctuation", value=True)
col_phrase = gr.Checkbox(label="Collapse Phrases", value=True)
compact = gr.Checkbox(label="Compact", value=False)
depen_output = gr.HTML(value=dependency(DEFAULT_TEXT, True, True, False, DEFAULT_MODEL))
dep_button = gr.Button("Generate Dependency Parser")
gr.Markdown("Entity Recognizer")
entity_input = gr.CheckboxGroup(DEFAULT_ENTS, value=DEFAULT_ENTS)
entity_output = gr.HTML(value=entity(DEFAULT_TEXT, DEFAULT_ENTS, DEFAULT_MODEL))
ent_button = gr.Button("Generate Entity Recognizer")
gr.Markdown("Token Properties")
with gr.Column():
tok_input = gr.CheckboxGroup(
DEFAULT_TOK_ATTR, value=DEFAULT_TOK_ATTR)
tok_output = gr.Dataframe(value=default_token(DEFAULT_TEXT, DEFAULT_TOK_ATTR, DEFAULT_MODEL),overflow_row_behaviour="paginate")
tok_button = gr.Button("Generate Token Properties")
gr.Markdown("Word and Phrase Similarity")
with gr.Row():
            sim_text1 = gr.Textbox(
                value="Apple", label="Word/Phrase 1", interactive=True)
            sim_text2 = gr.Textbox(
                value="U.K. startup", label="Word/Phrase 2", interactive=True)
sim_output = gr.Textbox(label="Similarity Score", value="0.12")
sim_random_button = gr.Button("Generate random words")
sim_button = gr.Button("Generate similarity")
gr.Markdown("Spans")
with gr.Column():
with gr.Row():
span1 = gr.Textbox(
label="Span 1", value="U.K. startup", placeholder="Input a part of the sentence")
label1 = gr.Textbox(value="ORG",
label="Label for Span 1")
with gr.Row():
span2 = gr.Textbox(
label="Span 2", value="U.K.", placeholder="Input another part of the sentence")
label2 = gr.Textbox(value="GPE",
label="Label for Span 2")
span_output = gr.HTML(value=span(DEFAULT_TEXT, "U.K. startup", "U.K.", "ORG", "GPE", DEFAULT_MODEL))
gr.Markdown(value="\n\n\n\n")
gr.Markdown(value="\n\n\n\n")
span_button = gr.Button("Generate spans")
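
    # Wire up the buttons: "Generate" refreshes every visualization at once;
    # the per-section buttons refresh only their own output.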
text_button.click(get_text, inputs=[model_input], outputs=text_input)
button.click(dependency, inputs=[
text_input, col_punct, col_phrase, compact, model_input], outputs=depen_output)
button.click(
entity, inputs=[text_input, entity_input, model_input], outputs=entity_output)
button.click(
token, inputs=[text_input, tok_input, model_input], outputs=tok_output)
button.click(vectors, inputs=[sim_text1,
sim_text2, model_input], outputs=sim_output)
button.click(
span, inputs=[text_input, span1, span2, label1, label2, model_input], outputs=span_output)
dep_button.click(dependency, inputs=[
text_input, col_punct, col_phrase, compact, model_input], outputs=depen_output)
ent_button.click(
entity, inputs=[text_input, entity_input, model_input], outputs=entity_output)
tok_button.click(
token, inputs=[text_input, tok_input, model_input], outputs=[tok_output])
sim_button.click(vectors, inputs=[
sim_text1, sim_text2, model_input], outputs=sim_output)
span_button.click(
span, inputs=[text_input, span1, span2, label1, label2, model_input], outputs=span_output)
sim_random_button.click(random_vectors, inputs=[text_input, model_input], outputs=[
sim_output, sim_text1, sim_text2])
demo.launch()