Spaces:

KwabsHug
/

Language-Learn-Idea

Running

App Files Files Community

Language-Learn-Idea / app.py

KwabsHug

Update app.py

b910e0e almost 2 years ago

raw

history blame

11 kB

	from googletrans import Translator
	import spacy
	import gradio as gr
	import nltk
	from nltk.corpus import wordnet
	import wikipedia
	import re

	# NLTK corpora/models fetched at start-up, listed in download order.
	_NLTK_RESOURCES = (
	    'maxent_ne_chunker',  # Chunker
	    'stopwords',  # Stop Words List (Mainly Roman Languages)
	    'words',  # 200 000+ Alphabetical order list
	    'punkt',  # Tokenizer
	    'verbnet',  # For Description of Verbs
	    'omw',
	    'omw-1.4',  # Multilingual Wordnet
	    'wordnet',  # For Definitions, Antonyms and Synonyms
	    'shakespeare',
	    'dolch',  # Sight words
	    'names',  # People Names NER
	    'gazetteers',  # Location NER
	    'opinion_lexicon',  # Sentiment words
	    'averaged_perceptron_tagger',  # Parts of Speech Tagging
	)
	for _resource in _NLTK_RESOURCES:
	    nltk.download(_resource)

	# The spaCy model is downloaded before it is loaded.
	_SPACY_MODEL = "en_core_web_sm"
	spacy.cli.download(_SPACY_MODEL)

	# Shared pipeline and translation client used by the functions below.
	nlp = spacy.load(_SPACY_MODEL)
	translator = Translator()

	def Sentencechunker(sentence):
	    """Return every leading run of words in ``sentence``, shortest first.

	    The sentence is split on single spaces and the growing prefixes are
	    joined with " | ", e.g. "a b c" gives "a | a b | a b c".
	    """
	    words = sentence.split(" ")
	    prefixes = [" ".join(words[:end]) for end in range(1, len(words) + 1)]
	    # Plain " | " separator: a backslash before the pipe is an invalid
	    # escape sequence and would put a literal backslash in the output.
	    return " | ".join(prefixes)

	def ReverseSentenceChunker(sentence):
	    """Chunk ``sentence`` cumulatively after reversing its word order.

	    Words are split on any whitespace, reversed, re-joined with single
	    spaces and then passed to ``Sentencechunker``.
	    """
	    backwards = sentence.split()
	    backwards.reverse()
	    return Sentencechunker(" ".join(backwards))

	def three_words_chunk(sentence):
	    """Return the sliding three-word windows of ``sentence`` joined by " | ".

	    Words are split on any whitespace. A sentence with fewer than three
	    words has no full window, so its words are returned as one chunk
	    instead of being silently dropped.
	    """
	    words = sentence.split()
	    if len(words) < 3:
	        return " ".join(words)
	    windows = [" ".join(words[start:start + 3]) for start in range(len(words) - 2)]
	    # Plain " | " separator: a backslash before the pipe is an invalid
	    # escape sequence and would put a literal backslash in the output.
	    return " | ".join(windows)

	def keep_nouns_verbs(sentence):
	    """Return only the noun, verb and punctuation tokens of ``sentence``.

	    The sentence is parsed with the module-level spaCy pipeline ``nlp``;
	    the surviving token texts are joined with single spaces.
	    """
	    wanted_pos = ['NOUN', 'VERB', 'PUNCT']
	    kept = [token.text for token in nlp(sentence) if token.pos_ in wanted_pos]
	    return " ".join(kept)

	def unique_word_count(text="", state=None):
	    """Count whitespace-separated words in ``text``, most frequent first.

	    Args:
	        text: Text whose words are counted.
	        state: Optional dict of earlier counts. It is updated in place so
	            counts can accumulate across calls; a fresh dict is used when
	            omitted.

	    Returns:
	        A one-element tuple holding the list of ``(word, count)`` pairs
	        sorted by descending count (ties keep first-seen order).
	        NOTE(review): the one-element tuple looks accidental, but the
	        shape is kept so existing callers see the same value.
	    """
	    word_counts = {} if state is None else state
	    for word in text.split():
	        word_counts[word] = word_counts.get(word, 0) + 1
	    sorted_word_counts = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
	    return (sorted_word_counts,)

	def Wordchunker(word):
	    """Return the growing prefixes of ``word``: "cat" gives ["c", "ca", "cat"]."""
	    return [word[:end] for end in range(1, len(word) + 1)]

	def BatchWordChunk(sentence):
	    """Spell out each word of ``sentence`` as its growing prefixes.

	    The sentence is split on single spaces. Every word contributes one
	    line that starts with a newline and lists the prefixes produced by
	    ``Wordchunker``, each followed by a space.
	    """
	    spelled_lines = []
	    for word in sentence.split(" "):
	        spelled = "".join(prefix + " " for prefix in Wordchunker(word))
	        spelled_lines.append("\n" + spelled)
	    return "".join(spelled_lines)

	# Target-language codes offered for the optional translation step
	# (the pre-selected value is "de"); passed to FrontRevSentChunk as `langdest`.

	langdest = gr.Dropdown(choices=["af", "de", "es", "ko", "ja", "zh-cn"], label="Choose Language", value="de")

	# Chunking strategies FrontRevSentChunk understands; "Chunks" is pre-selected.
	ChunkModeDrop = gr.Dropdown(choices=["Chunks", "Reverse", "Three Word Chunks", "Spelling Chunks"], label="Choose Chunk Type", value="Chunks")

	def FrontRevSentChunk(Chunkmode, Translate, Text, langdest):
	    """Chunk ``Text`` with the selected strategy and optionally translate it.

	    Args:
	        Chunkmode: One of "Chunks", "Reverse", "Three Word Chunks" or
	            "Spelling Chunks"; any other value produces no chunk text.
	        Translate: When truthy, the chunk text is translated and the
	            translation is appended on a new line.
	        Text: The text to chunk.
	        langdest: Destination language code handed to the translator.

	    Returns:
	        The chunk text, followed by its translation when requested.
	    """
	    chunkers = (
	        ("Chunks", Sentencechunker),
	        ("Reverse", ReverseSentenceChunker),
	        ("Three Word Chunks", three_words_chunk),
	        ("Spelling Chunks", BatchWordChunk),
	    )
	    chunked = "".join(chunker(Text) for mode, chunker in chunkers if Chunkmode == mode)

	    if not Translate:
	        return chunked

	    translation = translator.translate(chunked, dest=langdest)
	    return chunked + "\n" + translation.text

	def filter_words(words):
	    """Return the entries of ``words`` that NLTK tags as verb, noun or adjective.

	    ``words`` is tagged with ``nltk.pos_tag`` and only words whose tag is
	    in the kept set survive, in tagging order.
	    """
	    verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
	    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
	    adjective_tags = {'JJ', 'JJR', 'JJS'}
	    wanted_tags = verb_tags | noun_tags | adjective_tags

	    return [word for word, tag in nltk.pos_tag(words) if tag in wanted_tags]

	def SepHypandSynExpansion(text):
	    """List WordNet synonyms/hypernyms per token and flag tokens without any.

	    Args:
	        text: Text to tokenize with ``nltk.word_tokenize``.

	    Returns:
	        A ``(attention, report)`` pair of strings. ``attention`` names the
	        tokens with no WordNet hits that ``filter_words`` keeps;
	        ``report`` has one entry per token that did have hits.
	    """
	    tokens = nltk.word_tokenize(text)
	    no_hits = []
	    report_entries = []

	    # Find synonyms and hypernyms of each word in the text
	    for token in tokens:
	        synonyms = []
	        hypernyms = []
	        for synset in wordnet.synsets(token):
	            synonyms += synset.lemma_names()
	            hypernyms += [hypernym.name() for hypernym in synset.hypernyms()]
	        if not synonyms and not hypernyms:
	            no_hits.append(token)
	        else:
	            report_entries.append("\n" f"{token}: hypernyms={hypernyms}, synonyms={synonyms} \n")

	    # Collect tokens directly instead of joining and re-splitting a string:
	    # this avoids feeding an empty token to the tagger, and de-duplicating
	    # in first-seen order keeps the tagger input deterministic.
	    unique_no_hits = list(dict.fromkeys(no_hits))
	    attention_words = filter_words(unique_no_hits)
	    attention = "Words to pay special attention to: \n" + str(attention_words)
	    return attention, "".join(report_entries)


	def WikiSearch(term):
	    """Search Wikipedia for every word of ``term`` and return the titles found.

	    Up to 20 results are requested per word. Results of all words are
	    combined in order with duplicates removed; previously every search but
	    the last word's was thrown away. Blank input returns an empty list
	    without querying.
	    """
	    titles = []
	    for word in term.split():
	        titles.extend(wikipedia.search(word, results=20))
	    # dict.fromkeys de-duplicates while keeping first-seen order.
	    return list(dict.fromkeys(titles))

	def find_string_positions(s, string):
	    """Return the start index of every non-overlapping ``string`` in ``s``.

	    Args:
	        s: Text to search.
	        string: Substring to look for.

	    Returns:
	        Ascending list of match positions; empty when there is no match
	        or when ``string`` is empty.
	    """
	    # An empty needle matches at every index without advancing, which
	    # would otherwise loop forever.
	    if not string:
	        return []

	    positions = []
	    position = s.find(string)
	    while position != -1:
	        positions.append(position)
	        position = s.find(string, position + len(string))
	    return positions

	def splittext(string, split_positions):
	    """Cut ``string`` at offsets before ``split_positions`` and number the pieces.

	    Each cut is made 12 characters before a split position. The piece
	    before the first cut is skipped. Every later piece is re-flowed as its
	    first 29 characters, a newline, then the text from character 30 on,
	    with everything from the second-to-last space onwards removed. Pieces
	    are numbered from 1 and separated by a blank line.

	    NOTE(review): the 12/29/30 offsets presumably match subtitle
	    timestamps of the form "HH:MM:SS,mmm --> HH:MM:SS,mmm" - confirm
	    against the caller. The text after the last split position is not
	    emitted, and the first two characters of the result (the first
	    piece's "1" and newline) are dropped; both are kept as found.

	    Args:
	        string: The text to split.
	        split_positions: Ascending indices into ``string``.

	    Returns:
	        The numbered, re-flowed pieces as one string.
	    """
	    pieces = []
	    previous_cut = 0
	    for pos in split_positions:
	        # Clamp so a position below 12 cannot turn into a negative index
	        # that slices from the end of the string.
	        cut = max(pos - 12, 0)
	        pieces.append(string[previous_cut:cut])
	        previous_cut = cut

	    blocks = []
	    for number, piece in enumerate(pieces[1:], start=1):
	        reflowed = piece[0:29] + "\n" + piece[30:]
	        # Trim from the second-to-last space; keep the whole piece when it
	        # has fewer than two spaces instead of raising IndexError.
	        last_space = reflowed.rfind(" ")
	        trim_at = reflowed.rfind(" ", 0, last_space) if last_space != -1 else -1
	        if trim_at == -1:
	            trim_at = len(reflowed)
	        blocks.append(str(number) + "\n" + reflowed[:trim_at] + "\n" + "\n")
	    return "".join(blocks)[2:]

	def create_dictionary(word_list, word_dict=None):
	    """Group the distinct words of ``word_list`` by their first two letters.

	    Args:
	        word_list: Text whose words are separated by single spaces.
	        word_dict: Optional dict to extend in place. A fresh dict is made
	            for each call when omitted, so results no longer leak from
	            one call into the next.

	    Returns:
	        Dict mapping each two-letter prefix to its words in first-seen
	        order. Empty tokens (from repeated spaces or empty input) are
	        skipped.
	    """
	    if word_dict is None:
	        word_dict = {}
	    # dict.fromkeys de-duplicates like a set but keeps a stable order.
	    for word in dict.fromkeys(word_list.split(" ")):
	        if word:
	            word_dict.setdefault(word[:2], []).append(word)
	    return word_dict

	def merge_lines(doc1, doc2, doc3, doc4):
	    """Merge four text files line by line.

	    Line N of the result is line N of each document, stripped of
	    surrounding whitespace and concatenated in argument order. Merging
	    stops at the end of the shortest document. The merged lines are also
	    written to "merged_lines.txt" in the current directory.

	    Args:
	        doc1, doc2, doc3, doc4: File paths, or objects exposing the path
	            as a ``name`` attribute (such as uploaded temporary files).

	    Returns:
	        The list of merged lines.
	    """
	    import os
	    from contextlib import ExitStack

	    paths = [
	        doc if isinstance(doc, (str, bytes, os.PathLike)) else doc.name
	        for doc in (doc1, doc2, doc3, doc4)
	    ]
	    merged_lines = []

	    # ExitStack closes every file even if a later open() fails; a starred
	    # expression inside a `with` header is not valid syntax.
	    with ExitStack() as stack:
	        handles = [stack.enter_context(open(path, encoding="utf-8")) for path in paths]
	        output = stack.enter_context(open("merged_lines.txt", "w", encoding="utf-8"))
	        for lines in zip(*handles):
	            merged_lines.append("".join(line.strip() for line in lines))
	        output.writelines(merged + "\n" for merged in merged_lines)

	    return merged_lines

	def TTSforListeningPractice(text):
	    """Placeholder for text-to-speech practice; ``text`` is currently ignored."""
	    placeholder_message = "not finished"
	    return placeholder_message

	# Gradio UI: a Blocks app named `lliface` made of tabs that wrap the helper
	# functions defined above.
	# NOTE(review): the nesting depth of the `with` statements below cannot be
	# read from this text; confirm which components belong to which tab.
	with gr.Blocks() as lliface:
	with gr.Tab("Welcome "):
	gr.HTML("""<h1> Spaces Test - Still Undercontruction </h1> <p> You only learn when you convert things you dont know to known --> Normally Repetition is the only reliable method for everybody </p>
	<p> Knowledge is a Language but productive knowledge is find replace as well </p> <p>LingQ is good option for per word state management</p> <p> Arrows app json creator for easy knowledge graphing and spacy POS graph? </p>
	<p> Vocab = Glossary + all non text wall(lists, diagrams, etc.)</p>
	<p>https://huggingface.co/spaces/vumichien/whisper-speaker-diarization<br></p>""")
	# Text-in/text-out interfaces for word counting, WordNet expansion and Wikipedia search.
	gr.Interface(fn=unique_word_count, inputs="text", outputs="text", title="Wordcounter")
	gr.Interface(fn=SepHypandSynExpansion, inputs="text", outputs=["text", "text"], title="Word suggestions")
	gr.Interface(fn=WikiSearch, inputs="text", outputs="text", title="Unique word suggestions(wiki articles)")
	with gr.Tab("Spelling and Chunks"):
	gr.HTML("<p> Spelling is the end goal, you already know many letter orders called words so you need leverage them to remember random sequences")
	with gr.Tab("Spelling Simplification - Use a dual language list"):
	gr.Interface(fn=create_dictionary, inputs="text", outputs="text", title="Sort Text by first two letters")
	with gr.Tab("Chunks"):
	# Inputs map positionally to FrontRevSentChunk(Chunkmode, Translate, Text, langdest).
	gr.Interface(fn=FrontRevSentChunk, inputs=[ChunkModeDrop, "checkbox", "text", langdest], outputs="text")
	gr.Interface(fn=keep_nouns_verbs, inputs=["text"], outputs="text", title="Noun and Verbs only (Plus punctuation)")
	with gr.Tab("Timing Practice - Repitition"):
	gr.HTML("<p>Run from it, Dread it, Repitition is inevitable - Thanos</p> <p>Next Milestone is Turning this interface handsfree</p>")
	gr.HTML("""<iframe height="1200" style="width: 100%;" scrolling="no" title="Memorisation Aid" src="https://codepen.io/kwabs22/embed/preview/GRXKQgj?default-tab=result&editable=true" frameborder="no" loading="lazy" allowtransparency="true" allowfullscreen="true">
	See the Pen <a href="https://codepen.io/kwabs22/pen/GRXKQgj">
	Memorisation Aid</a> by kwabs22 (<a href="https://codepen.io/kwabs22">@kwabs22</a>)
	on <a href="https://codepen.io">CodePen</a>.
	</iframe>""")
	with gr.Tab("Knowledge Ideas"):
	gr.HTML("""<p>Good knowledge = ability to answer questions --> find Questions you cant answer and look for hidden answer within them </p>
	<p>My One Word Theory = We only use more words than needed when we have to or are bored --> Headings exist because title is not sufficient, subheadings exist because headings are not sufficient, Book Text exists because subheadings are not sufficient</p>
	<p>Big Picture = Expand the Heading and the subheadings and compare them to each other</p>
	<p>Application of Knowledge = App Version of the text (eg. Jupyter Notebooks) is what you create and learn first</p>
	""")
	with gr.Tab("Beginner - Songs - Chorus"):
	gr.HTML("Essentially if the sounds are repeated or long notes they are easy to remember")
	# TTSforListeningPractice is a stub that always returns "not finished".
	gr.Interface(fn=TTSforListeningPractice, inputs="text", outputs="text", title="Placeholder - paste chorus here and use TTS or make notes to save here")
	with gr.Tab("Transcribe - RASMUS Whisper"):
	gr.HTML("""<p>If this tab doesnt work use the link below ⬇️</p> <a href="https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles">https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles</a>""")
	# Loads an interface from another Space by name.
	# NOTE(review): presumably needs network access at start-up - confirm.
	gr.Interface.load("spaces/RASMUS/Whisper-youtube-crosslingual-subtitles", title="Subtitles")
	with gr.Tab("Advanced - LingQ Addons ideas"):
	gr.HTML("Extra functions needed - Persitent Sentence translation, UNWFWO, POS tagging and Word Count per user of words in their account. Macaronic Text is also another way to practice only the important information")
	# Four file inputs whose lines merge_lines combines.
	with gr.Row():
	RomanFile = gr.File(label="Paste Roman")
	W4WFile = gr.File(label="Paste Word 4 Word")
	FullMeanFile = gr.File(label="Paste Full Meaning")
	MacaronicFile = gr.File(label="Paste Macaronic Text")
	with gr.Row():
	MergeButton = gr.Button()
	with gr.Row():
	MergeOutput = gr.TextArea(label="Output")
	# Clicking the button passes the four files to merge_lines and shows its result in MergeOutput.
	MergeButton.click(merge_lines, inputs=[RomanFile, W4WFile, FullMeanFile, MacaronicFile], outputs=[MergeOutput])
	with gr.Tab("Dictionary from text"):
	gr.Interface(fn=create_dictionary, inputs="text", outputs="text", title="Two Letter Dictionary")


	# Start the Gradio app.
	lliface.launch()