Spaces:
Running
Running
from googletrans import Translator | |
import spacy | |
import gradio as gr | |
import nltk | |
from nltk.corpus import wordnet | |
# NLTK resources requested at start-up, downloaded one by one in this order.
for _nltk_resource in (
    'maxent_ne_chunker',  # Chunker
    'stopwords',          # Stop Words List (Mainly Roman Languages)
    'words',              # 200 000+ Alphabetical order list
    'punkt',              # Tokenizer
    'verbnet',            # For Description of Verbs
    'omw',
    'omw-1.4',            # Multilingual Wordnet
    'wordnet',            # For Definitions, Antonyms and Synonyms
    'shakespeare',
    'dolch',              # Sight words
    'names',              # People Names NER
    'gazetteers',         # Location NER
    'opinion_lexicon',    # Sentiment words
):
    nltk.download(_nltk_resource)

# Download the small English spaCy model, then load it as the shared pipeline
# used by keep_nouns_verbs.
spacy.cli.download("en_core_web_sm")
nlp = spacy.load('en_core_web_sm')

# Shared googletrans client used by FrontRevSentChunk.
translator = Translator()
def Sentencechunker(sentence):
    """Return the growing word-prefixes of *sentence*, joined by " | ".

    The sentence is split on single spaces and the 1-word, 2-word, ...
    prefixes are emitted in order, e.g. "a b c" -> "a | a b | a b c".
    """
    words = sentence.split(" ")
    prefixes = [" ".join(words[:end]) for end in range(1, len(words) + 1)]
    return " | ".join(prefixes)
def ReverseSentenceChunker(sentence):
    """Reverse the word order of *sentence*, then chunk it with Sentencechunker.

    Words are separated on any whitespace, reversed, re-joined with single
    spaces and handed to Sentencechunker, whose result is returned as is.
    """
    words_backwards = sentence.split()[::-1]
    return Sentencechunker(" ".join(words_backwards))
def three_words_chunk(sentence):
    """Return every run of three consecutive words, joined by " | ".

    Words are separated on any whitespace. A sentence with fewer than three
    words has no such run, so the result is an empty string.
    """
    tokens = sentence.split()
    # Zipping the list with itself shifted by one and two yields each
    # three-word window in order; zip stops when the shortest slice runs out.
    windows = [" ".join(triple) for triple in zip(tokens, tokens[1:], tokens[2:])]
    return " | ".join(windows)
def keep_nouns_verbs(sentence):
    """Return only the noun, verb and punctuation tokens of *sentence*.

    The sentence is run through the module-level spaCy pipeline ``nlp``;
    tokens whose ``pos_`` is NOUN, VERB or PUNCT are kept in their original
    order and joined with single spaces.
    """
    wanted_pos = ['NOUN', 'VERB', 'PUNCT']
    kept = [token.text for token in nlp(sentence) if token.pos_ in wanted_pos]
    return " ".join(kept)
def unique_word_count(text="", state=None):
    """Count whitespace-separated words in *text*, most frequent first.

    Args:
        text: Input text; split on any whitespace.
        state: Optional dict of running counts. When given it is updated in
            place, so counts accumulate across calls that share it.

    Returns:
        A one-element tuple holding a list of ``(word, count)`` pairs sorted
        by descending count.
    """
    tally = {} if state is None else state
    for token in text.split():
        tally[token] = tally.get(token, 0) + 1
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    # NOTE(review): the original ends in a trailing comma, so the list is
    # wrapped in a 1-tuple; kept as is -- confirm whether callers rely on it.
    return (ranked,)
def Wordchunker(word):
    """Return the growing prefixes of *word* as a list.

    For example "cat" -> ["c", "ca", "cat"]; an empty word gives [].
    """
    return [word[:end] for end in range(1, len(word) + 1)]
def BatchWordChunk(sentence):
    """Spell out each word of *sentence* as its growing prefixes, one per line.

    The sentence is split on single spaces. Each word contributes a newline
    followed by its Wordchunker prefixes, every prefix followed by a space.
    """
    lines = []
    for word in sentence.split(" "):
        spelled = "".join(piece + " " for piece in Wordchunker(word))
        lines.append("\n" + spelled)
    return "".join(lines)
# Target-language selector for the optional translation step; the value is
# passed as ``dest`` to the translator in FrontRevSentChunk.
langdest = gr.Dropdown(
    choices=["af", "de", "es", "ko", "ja", "zh-cn"],
    label="Choose Language",
    value="de",
)

# Selector for which chunking routine FrontRevSentChunk applies.
ChunkModeDrop = gr.Dropdown(
    choices=["Chunks", "Reverse", "Three Word Chunks", "Spelling Chunks"],
    label="Choose Chunk Type",
    value="Chunks",
)
def FrontRevSentChunk(Chunkmode, Translate, Text, langdest):
    """Chunk *Text* with the selected mode and optionally append a translation.

    Args:
        Chunkmode: One of "Chunks", "Reverse", "Three Word Chunks" or
            "Spelling Chunks"; any other value produces no chunk output.
        Translate: When truthy, the chunked text is translated and the
            translation is appended on a new line.
        Text: The text to chunk.
        langdest: Destination language code handed to the translator.

    Returns:
        The chunked text, followed by "\\n" and its translation when
        *Translate* is truthy.
    """
    chunkers = {
        "Chunks": Sentencechunker,
        "Reverse": ReverseSentenceChunker,
        "Three Word Chunks": three_words_chunk,
        "Spelling Chunks": BatchWordChunk,
    }
    chunker = chunkers.get(Chunkmode)
    chunked = chunker(Text) if chunker is not None else ""

    if not Translate:
        return chunked

    translated = translator.translate(chunked, dest=langdest)
    return chunked + "\n" + translated.text
def SepHypandSynExpansion(text):
    """List WordNet synonyms and hypernyms for each token of *text*.

    The text is tokenized with ``nltk.word_tokenize``. For every token, the
    lemma names and hypernym names of all its WordNet synsets are gathered.

    Returns:
        A pair ``(no_hits, expansions)``: *no_hits* is a header string
        followed by each token that had neither synonyms nor hypernyms;
        *expansions* has one line per remaining token with both lists.
    """
    unmatched = []
    expansions = []
    for token in nltk.word_tokenize(text):
        synonyms = []
        hypernyms = []
        for synset in wordnet.synsets(token):
            synonyms.extend(synset.lemma_names())
            hypernyms.extend(parent.name() for parent in synset.hypernyms())

        if synonyms or hypernyms:
            expansions.append(f"\n{token}: hypernyms={hypernyms}, synonyms={synonyms}")
        else:
            unmatched.append(f"{token} | ")

    no_hits = "Words to pay special attention to: " + "".join(unmatched)
    return no_hits, "".join(expansions)
# Build the tabbed Gradio app: a welcome page, an embedded Whisper space, the
# chunking tools, the word-count / WordNet tools and an embedded CodePen.
with gr.Blocks() as lliface:
    with gr.Tab("Welcome"):
        # Adjacent literals concatenate into a single HTML snippet. The
        # second paragraph used to open with an empty "<>" tag although it is
        # closed by "</p>"; it now opens with "<p>" so the markup is balanced.
        gr.HTML(
            "<h1> Spaces Test - Still Undercontruction </h1> "
            "<p> Knowledge is a Language </p> "
            "<p> Arrows app json creator for easy knowledge graphing and spacy POS graph? </p> "
            "<p> https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles<br>, "
            "https://huggingface.co/spaces/vumichien/whisper-speaker-diarization<br> "
            "Maybe duplicate these, private them and then load into spaces? --> "
            "Whisper space for youtube, Clip Interrogator, load here and all my random functions esp. text to HTML </p>"
        )
    with gr.Tab("Transcribe - RASMUS Whisper"):
        gr.HTML("""<a href="https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles">https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles</a>""")
        gr.Interface.load("spaces/RASMUS/Whisper-youtube-crosslingual-subtitles", title="Subtitles")
    with gr.Tab("Chunks"):
        # Inputs map positionally onto (Chunkmode, Translate, Text, langdest).
        gr.Interface(fn=FrontRevSentChunk, inputs=[ChunkModeDrop, "checkbox", "text", langdest], outputs="text")
        gr.Interface(fn=keep_nouns_verbs, inputs=["text"], outputs="text", title="Noun and Verbs only (Plus punctuation)")
    with gr.Tab("Unique words, Hypernyms and synonyms"):
        gr.Interface(fn=unique_word_count, inputs="text", outputs="text", title="Wordcounter")
        # SepHypandSynExpansion returns two strings, hence two text outputs.
        gr.Interface(fn=SepHypandSynExpansion, inputs="text", outputs=["text", "text"], title="Word suggestions")
    with gr.Tab("Timing Practice"):
        gr.HTML("""<iframe height="1200" style="width: 100%;" scrolling="no" title="Memorisation Aid" src="https://codepen.io/kwabs22/embed/preview/GRXKQgj?default-tab=result&editable=true" frameborder="no" loading="lazy" allowtransparency="true" allowfullscreen="true">
See the Pen <a href="https://codepen.io/kwabs22/pen/GRXKQgj">
Memorisation Aid</a> by kwabs22 (<a href="https://codepen.io/kwabs22">@kwabs22</a>)
on <a href="https://codepen.io">CodePen</a>.
</iframe>""")

# Launch the assembled app.
lliface.launch()