Spaces:
Running
Running
from googletrans import Translator | |
import spacy | |
import gradio as gr | |
import nltk | |
from nltk.corpus import wordnet | |
import wikipedia | |
import re | |
import time | |
import random | |
import os | |
import zipfile | |
import gradio as gr | |
import ffmpeg | |
# Fetch the NLTK resources the helpers below rely on. These calls run at
# import time, i.e. on every start of the app.
nltk.download('maxent_ne_chunker') #Chunker
nltk.download('stopwords') #Stop Words List (Mainly Roman Languages)
nltk.download('words') #200 000+ Alphabetical order list
nltk.download('punkt') #Tokenizer
nltk.download('verbnet') #For Description of Verbs
nltk.download('omw')
nltk.download('omw-1.4') #Multilingual Wordnet
nltk.download('wordnet') #For Definitions, Antonyms and Synonyms
nltk.download('shakespeare')
nltk.download('dolch') #Sight words
nltk.download('names') #People Names NER
nltk.download('gazetteers') #Location NER
nltk.download('opinion_lexicon') #Sentiment words
nltk.download('averaged_perceptron_tagger') #Parts of Speech Tagging
# Download and load the small English spaCy pipeline; `nlp` is the shared
# module-level pipeline used by keep_nouns_verbs.
spacy.cli.download("en_core_web_sm")
nlp = spacy.load('en_core_web_sm')
# Shared googletrans client used by FrontRevSentChunk.
translator = Translator()
def Sentencechunker(sentence):
    """Return every leading run of words in ``sentence``, joined by " | ".

    The sentence is split on single spaces, so "a b c" becomes
    "a | a b | a b c". An empty sentence yields an empty string.
    """
    words = sentence.split(" ")
    prefixes = [" ".join(words[:end]) for end in range(1, len(words) + 1)]
    return " | ".join(prefixes)
def ReverseSentenceChunker(sentence):
    """Chunk ``sentence`` cumulatively, starting from its last word.

    The whitespace-separated words are put in reverse order and the result
    is handed to ``Sentencechunker``.
    """
    words_backwards = sentence.split()[::-1]
    return Sentencechunker(" ".join(words_backwards))
def three_words_chunk(sentence):
    """Return all sliding three-word windows of ``sentence`` joined by " | ".

    Sentences with fewer than three words produce an empty string.
    """
    words = sentence.split()
    # Zipping the list with itself shifted by one and two positions walks
    # every consecutive triple and stops when the shortest slice runs out.
    triples = zip(words, words[1:], words[2:])
    return " | ".join(" ".join(triple) for triple in triples)
def keep_nouns_verbs(sentence):
    """Keep only the nouns, verbs and punctuation of ``sentence``.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    surviving token texts are joined with single spaces.
    """
    wanted_pos = ('NOUN', 'VERB', 'PUNCT')
    kept = [token.text for token in nlp(sentence) if token.pos_ in wanted_pos]
    return " ".join(kept)
def unique_word_count(text="", state=None):
    """Count whitespace-separated words, most frequent first.

    Args:
        text: Text whose words are counted.
        state: Optional dict of existing counts; it is updated in place so
            counts can accumulate over several calls.

    Returns:
        A one-element tuple holding the list of ``(word, count)`` pairs
        sorted by descending count.
    """
    tallies = {} if state is None else state
    for word in text.split():
        tallies[word] = tallies.get(word, 0) + 1
    ranked = sorted(tallies.items(), key=lambda pair: pair[1], reverse=True)
    return (ranked,)
def Wordchunker(word):
    """Return the growing prefixes of ``word``: "cat" -> ["c", "ca", "cat"]."""
    return [word[:end] for end in range(1, len(word) + 1)]
def BatchWordChunk(sentence):
    """Spell out every word of ``sentence`` as its growing prefixes.

    Each space-separated word contributes one line: a newline followed by
    the prefixes from ``Wordchunker``, each prefix followed by a space.
    """
    lines = []
    for word in sentence.split(" "):
        spelled = "".join(prefix + " " for prefix in Wordchunker(word))
        lines.append("\n" + spelled)
    return "".join(lines)
# Dropdown inputs for FrontRevSentChunk: the target language code for the
# optional translation (default "de") and the chunking mode.
langdest = gr.Dropdown(choices=["af", "de", "es", "ko", "ja", "zh-cn"], label="Choose Language", value="de")
ChunkModeDrop = gr.Dropdown(choices=["Chunks", "Reverse", "Three Word Chunks", "Spelling Chunks"], label="Choose Chunk Type", value="Chunks")
def FrontRevSentChunk(Chunkmode, Translate, Text, langdest):
    """Chunk ``Text`` in the selected mode and optionally append a translation.

    Args:
        Chunkmode: One of "Chunks", "Reverse", "Three Word Chunks" or
            "Spelling Chunks"; any other value produces no chunk output.
        Translate: When truthy, the chunked text is translated and the
            translation is appended on a new line.
        Text: The text to chunk.
        langdest: Destination language code passed to the translator.

    Returns:
        The chunked text, followed by its translation when requested.
    """
    chunkers = {
        "Chunks": Sentencechunker,
        "Reverse": ReverseSentenceChunker,
        "Three Word Chunks": three_words_chunk,
        "Spelling Chunks": BatchWordChunk,
    }
    chunker = chunkers.get(Chunkmode)
    output = chunker(Text) if chunker is not None else ""
    if Translate:
        translation = translator.translate(output, dest=langdest)
        output += "\n" + translation.text
    return output
# Define a function to filter out non-verb, noun, or adjective words | |
def filter_words(words):
    """Return the words that NLTK tags as a verb, noun or adjective.

    ``words`` is tagged with ``nltk.pos_tag`` and only the words whose tag is
    in the keep-set below are returned, in tagging order.
    """
    # Penn Treebank tags for verbs (VB*), nouns (NN*) and adjectives (JJ*).
    keep_pos = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'}
    kept = []
    for word, tag in nltk.pos_tag(words):
        if tag in keep_pos:
            kept.append(word)
    return kept
def SepHypandSynExpansion(text):
    """List WordNet synonyms and hypernyms for every token of ``text``.

    Tokens without any WordNet entry are collected separately, de-duplicated
    in first-seen order and narrowed with ``filter_words`` to verbs, nouns and
    adjectives.

    Args:
        text: The text to analyse.

    Returns:
        A ``(no_hits, expansions)`` tuple of strings: a heading followed by
        the list of words WordNet does not know, and one
        "token: hypernyms=..., synonyms=..." entry per known token.
    """
    tokens = nltk.word_tokenize(text)
    no_hit_tokens = []
    report_lines = []
    for token in tokens:
        synonyms = []
        hypernyms = []
        for synset in wordnet.synsets(token):
            synonyms += synset.lemma_names()
            hypernyms += [hypernym.name() for hypernym in synset.hypernyms()]
        if not synonyms and not hypernyms:
            no_hit_tokens.append(token)
        else:
            report_lines.append("\n" f"{token}: hypernyms={hypernyms}, synonyms={synonyms} \n")
    # Keep the misses as a real list: joining them into a " | " string and
    # splitting again left an empty token behind, and a set gave the tagger
    # its input in arbitrary order.
    unique_no_hits = list(dict.fromkeys(no_hit_tokens))
    NoHits = "Words to pay special attention to: \n" + str(filter_words(unique_no_hits))
    return NoHits, "".join(report_lines)
def WikiSearch(term):
    """Search Wikipedia for every word of ``term`` and merge the results.

    The original looped over all words but returned only the last word's
    results, discarding the earlier (network) searches. Now every word's
    results are kept, without duplicates and in first-seen order. For a
    single-word term the result is unchanged.

    Args:
        term: Whitespace-separated search words.

    Returns:
        A list of article titles, up to 20 per word; empty when ``term``
        contains no words.
    """
    titles = []
    seen = set()
    for item in term.split():
        for title in wikipedia.search(item, results=20):
            if title not in seen:
                seen.add(title)
                titles.append(title)
    return titles
def create_dictionary(word_list, word_dict=None):
    """Group the distinct words of ``word_list`` by their first two letters.

    Args:
        word_list: Space-separated words.
        word_dict: Optional existing mapping to extend in place. A fresh
            dict is created per call when omitted, so results no longer
            leak from one call into the next.

    Returns:
        The mapping from two-letter prefix to the list of words starting
        with it, in first-seen order.
    """
    if word_dict is None:
        word_dict = {}
    # dict.fromkeys removes duplicates like the original set() did, but
    # keeps the input order so the output is deterministic.
    for word in dict.fromkeys(word_list.split(" ")):
        if not word:
            # Repeated spaces produce empty tokens; they carry no word.
            continue
        word_dict.setdefault(word[:2], []).append(word)
    return word_dict
def merge_lines(roman_file, w4w_file, full_mean_file, macaronic_file):
    """Interleave four text files line by line.

    Each argument is an object exposing the file's path as ``.name``. Line N
    of the result group is line N of each file, stripped, in argument order.
    Merging stops at the end of the shortest file.

    Returns:
        All merged lines joined with newlines.
    """
    # The files hold multilingual text, so decode explicitly as UTF-8 rather
    # than relying on the platform default encoding.
    with open(roman_file.name, "r", encoding="utf-8") as f1, \
            open(w4w_file.name, "r", encoding="utf-8") as f2, \
            open(full_mean_file.name, "r", encoding="utf-8") as f3, \
            open(macaronic_file.name, "r", encoding="utf-8") as f4:
        merged_lines = [
            "\n".join(line.strip() for line in lines)
            for lines in zip(f1, f2, f3, f4)
        ]
    return "\n".join(merged_lines)
def TTSforListeningPractice(text):
    """Placeholder for a text-to-speech tool; ``text`` is currently ignored.

    Returns:
        The fixed string "not finished".
    """
    placeholder_message = "not finished"
    return placeholder_message
def group_words(inlist):
    """Endlessly yield shuffled groups of up to ten words for timed practice.

    The space-separated words of ``inlist`` are split into consecutive
    groups of ten. The generator yields the current group as a
    space-joined string and then sleeps ten seconds. A group is reshuffled
    before each of its first six yields; on the seventh pass the generator
    moves on to the next group (wrapping around) and yields it unshuffled.

    Args:
        inlist: Space-separated words.

    Yields:
        The words of the current group joined by single spaces.
    """
    words = inlist.split(" ")
    # str.split(" ") always returns at least one item, so there is always at
    # least one group and the modulo below never divides by zero.
    word_groups = [words[start:start + 10] for start in range(0, len(words), 10)]
    current_group_index = 0
    current_group_time = 0
    while True:
        if current_group_time == 60:
            current_group_index = (current_group_index + 1) % len(word_groups)
            current_group_time = 0
        else:
            # The timer only moves in steps of ten, so the group is
            # reshuffled on every pass through this branch.
            random.shuffle(word_groups[current_group_index])
            current_group_time += 10
        yield " ".join(word_groups[current_group_index])
        time.sleep(10)
def split_verbs_nouns(text):
    """Split ``text`` into its verbs/nouns and everything else.

    Punctuation tokens are copied into both outputs so each half keeps the
    sentence boundaries.

    Args:
        text: The text to analyse.

    Returns:
        A ``(verbs_nouns_text, other_words_text)`` tuple of space-joined
        token texts.
    """
    # Reuse the module-level pipeline instead of reloading the model from
    # disk on every call.
    doc = nlp(text)
    verbs_nouns = []
    other_words = []
    for token in doc:
        if token.pos_ in ("VERB", "NOUN"):
            verbs_nouns.append(token.text)
        elif token.is_punct:
            # is_punct is derived from the token text, so this replaces the
            # per-token rebuild of the document's punctuation list.
            verbs_nouns.append(token.text)
            other_words.append(token.text)
        else:
            other_words.append(token.text)
    verbs_nouns_text = " ".join(verbs_nouns)
    other_words_text = " ".join(other_words)
    return verbs_nouns_text, other_words_text
def split_srt_file(text):
    """Insert " | " between the words of every subtitle in SRT ``text``.

    Sections are separated by blank lines; within a section the first line
    is kept as the index, the second as the timestamp, and the remaining
    lines are joined and have each space replaced by " | ".

    Args:
        text: The contents of an SRT file.

    Returns:
        The SRT text with word-separated subtitle lines. Sections with
        fewer than two lines (for example the empty one produced by a
        trailing blank line) are passed through unchanged.
    """
    separator = ' | '
    srt_sections = text.split('\n\n')
    for i, section in enumerate(srt_sections):
        section_lines = section.split('\n')
        if len(section_lines) < 2:
            # No timestamp line: nothing to rewrite, and indexing it would
            # raise IndexError.
            continue
        index_line, timestamp = section_lines[0], section_lines[1]
        subtitle_text = separator.join(section_lines[2:]).replace(' ', separator)
        # A leading space in the subtitle text turns into a leading
        # separator. Drop it only when it is actually there, so text without
        # a leading space no longer loses its first three characters.
        if subtitle_text.startswith(separator):
            subtitle_text = subtitle_text[len(separator):]
        srt_sections[i] = f"{index_line}\n{timestamp}\n{subtitle_text}"
    return '\n\n'.join(srt_sections)
def find_string_positions(s, string):
    """Return the start indices of all non-overlapping ``string`` in ``s``.

    Args:
        s: The text to search.
        string: The substring to look for.

    Returns:
        A list of indices in ascending order; empty when there is no match
        or when ``string`` is empty.
    """
    if not string:
        # An empty needle matches at every position without advancing the
        # search, which would loop forever.
        return []
    positions = []
    start = 0
    while True:
        position = s.find(string, start)
        if position == -1:
            break
        positions.append(position)
        start = position + len(string)
    return positions
def splittext(string):
    """Rebuild numbered, blank-line-separated entries from text containing " --> ".

    The text is cut 12 characters before every " --> " marker; each piece
    after the first is re-emitted as a number line, the piece with a line
    break inserted after its first 29 characters, and a blank line.

    NOTE(review): the fixed offsets 12, 29 and 30 presumably match the
    "HH:MM:SS,mmm --> HH:MM:SS,mmm" timestamp layout of SRT files - confirm.
    """
    split_positions = find_string_positions(string, " --> ")
    split_strings = []
    prepos = 0
    for pos in split_positions:
        # Move the cut 12 characters to the left of the marker.
        pos -= 12
        split_strings.append((string[prepos:pos]))
        prepos = pos
    # The text from the last cut to the end of the string is never appended.
    FinalOutput = ""
    stoutput = ""
    linenumber = 1
    print(linenumber)
    # The first piece (everything before the first cut) is skipped.
    for item in split_strings[1:]:
        # Line break after the first 29 characters; the character at index
        # 29 is dropped.
        stoutput = item[0:29] + "\n" + item[30:]
        stspaces = find_string_positions(stoutput, " ")
        # Keep everything up to the second-to-last space; raises IndexError
        # when the piece contains fewer than two spaces.
        FinalOutput += str(linenumber) + "\n" + stoutput[:stspaces[-2]] + "\n"
        FinalOutput += "\n"
        linenumber += 1
    # Drops the first two characters of the assembled output (the leading
    # "1" and newline when at least one entry was produced).
    return FinalOutput[2:]
def VideotoSegment(video_file, subtitle_file):
    """Cut a video into one clip per subtitle and zip the clips with MP3s.

    Args:
        video_file: Path of the video, or an object exposing it as ``.name``
            (as ``merge_lines`` expects of its uploaded files).
        subtitle_file: SRT source: a path, an object exposing the path as
            ``.name``, or an iterable of text lines.

    Returns:
        The name of the ZIP archive, 'segmented_files.zip', holding every
        'segment_<i>.mp4' and 'segment_<i>.mp3'. The individual segment
        files are deleted once they have been archived.
    """
    # ffmpeg needs a path, not the uploaded file object itself.
    video_path = getattr(video_file, "name", video_file)
    timings = _read_subtitle_timings(subtitle_file)

    # Cut the video into segments based on the subtitle timings
    video_segments = []
    for i, (start_time, end_time) in enumerate(timings):
        segment_video = f'segment_{i}.mp4'
        ffmpeg.input(video_path, ss=start_time, to=end_time).output(segment_video, codec='copy').run()
        video_segments.append(segment_video)

    # Convert each segment to an MP3 audio file using FFmpeg
    audio_segments = []
    for i, segment_video in enumerate(video_segments):
        segment_audio = f'segment_{i}.mp3'
        ffmpeg.input(segment_video).output(segment_audio, codec='libmp3lame', qscale='4').run()
        audio_segments.append(segment_audio)

    # The context manager closes the archive even if adding a file fails.
    with zipfile.ZipFile('segmented_files.zip', 'w') as zip_file:
        for segment in video_segments + audio_segments:
            zip_file.write(segment)
            os.remove(segment)

    # Return the ZIP archive for download
    return 'segmented_files.zip'


def _read_subtitle_timings(subtitle_file):
    """Return the (start, end) time pair of every '-->' line of an SRT source."""
    subtitle_path = getattr(subtitle_file, "name", None)
    if subtitle_path is None and isinstance(subtitle_file, (str, os.PathLike)):
        subtitle_path = subtitle_file
    if subtitle_path is not None:
        # Read by path in text mode so every line is a str.
        with open(subtitle_path, "r", encoding="utf-8", errors="replace") as handle:
            lines = handle.readlines()
    else:
        lines = list(subtitle_file)

    timings = []
    for line in lines:
        if '-->' in line:
            start, _, end = line.partition('-->')
            # FFmpeg expects '.' rather than SRT's ',' before the milliseconds.
            start_time = start.strip().replace(',', '.')
            end_time = end.strip().replace(',', '.')
            timings.append((start_time, end_time))
    return timings
# Define the Gradio interface inputs and outputs for video split
spvvideo_file_input = gr.File(label='Video File')
spvsubtitle_file_input = gr.File(label='Subtitle File')
spvdownload_output = gr.File(label='Download Segmented Files')
# Input and output boxes for the word grouping tool. They use the top-level
# gr.Textbox component, like gr.File and gr.Dropdown elsewhere in this file,
# instead of the deprecated gr.inputs / gr.outputs namespaces.
groupinput_text = gr.Textbox(lines=2, label="Enter a list of words")
groupoutput_text = gr.Textbox(label="Grouped words")
# Build the tabbed UI that exposes the helper functions above.
# NOTE(review): the nesting of the sub-tabs below is reconstructed from the
# order of the statements - confirm against the intended layout.
with gr.Blocks() as lliface:
    # Landing tab: static notes only.
    with gr.Tab("Welcome"):
        gr.HTML("""<h1> Spaces Test - Still Undercontruction </h1> <p> You only learn when you convert things you dont know to known --> Normally Repetition is the only reliable method for everybody </p>
<p> Knowledge is a Language but productive knowledge is find replace as well </p> <p>LingQ is good option for per word state management</p> <p> Arrows app json creator for easy knowledge graphing and spacy POS graph? </p>
<p> Vocab = Glossary + all non text wall(lists, diagrams, etc.)</p>
<p> https://huggingface.co/spaces/vumichien/whisper-speaker-diarization<br></p>
<p> In Language the goal is bigger vocab --> Knowledge equivalent = question answer pairs but to get to those you need related information pairs</p>
<p> ChatGPT Turns Learning into a read only what you dont know ask only what you dont know feedback loop --> All you have to do is keep track of what prompts you have asked in the past</p>
<p> Spell multiple words simultaneously for simultaneous access </p>
""")
    # Word counting, WordNet expansion and Wikipedia search tools.
    with gr.Tab("Unique word ID"):
        gr.Interface(fn=unique_word_count, inputs="text", outputs="text", title="Wordcounter")
        gr.Interface(fn=SepHypandSynExpansion, inputs="text", outputs=["text", "text"], title="Word suggestions")
        gr.Interface(fn=WikiSearch, inputs="text", outputs="text", title="Unique word suggestions(wiki articles)")
    with gr.Tab("Automating related information linking"):
        gr.HTML("Questions - Tacking and suggesting questions to ask = new education")
    # Spelling helpers and the sentence chunking tools.
    with gr.Tab("Spelling and Chunks"):
        gr.Text("Merged Spelling Practice Placeholder - Spell multiple words simultaneously for simultaneous access")
        gr.HTML("<p> Spelling is the end goal, you already know many letter orders called words so you need leverage them to remember random sequences")
        with gr.Tab("Spelling Simplification - Use a dual language list"):
            gr.Interface(fn=create_dictionary, inputs="text", outputs="text", title="Sort Text by first two letters")
        with gr.Tab("Chunks"):
            # Input order matches FrontRevSentChunk(Chunkmode, Translate, Text, langdest).
            gr.Interface(fn=FrontRevSentChunk, inputs=[ChunkModeDrop, "checkbox", "text", langdest], outputs="text")
            gr.Interface(fn=keep_nouns_verbs, inputs=["text"], outputs="text", title="Noun and Verbs only (Plus punctuation)")
    # Timed repetition: a generator-backed Gradio version and an embedded CodePen.
    with gr.Tab("Timing Practice - Repitition"):
        gr.HTML("<p>Run from it, Dread it, Repitition is inevitable - Thanos</p> <p>Next Milestone is Turning this interface handsfree</p>")
        with gr.Tab("Gradio Version"):
            gr.Interface(fn=group_words, inputs=groupinput_text, outputs=groupoutput_text, title="Word Grouping and Rotation", description="Group a list of words into sets of 10 and rotate them every 60 seconds.").queue()
        with gr.Tab("HTML Version"):
            gr.HTML("""<iframe height="1200" style="width: 100%;" scrolling="no" title="Memorisation Aid" src="https://codepen.io/kwabs22/embed/preview/GRXKQgj?default-tab=result&editable=true" frameborder="no" loading="lazy" allowtransparency="true" allowfullscreen="true">
See the Pen <a href="https://codepen.io/kwabs22/pen/GRXKQgj">
Memorisation Aid</a> by kwabs22 (<a href="https://codepen.io/kwabs22">@kwabs22</a>)
on <a href="https://codepen.io">CodePen</a>.
</iframe>""")
    with gr.Tab("Knowledge Ideas"):
        gr.HTML("""<p>Good knowledge = ability to answer questions --> find Questions you cant answer and look for hidden answer within them </p>
<p>My One Word Theory = We only use more words than needed when we have to or are bored --> Headings exist because title is not sufficient, subheadings exist because headings are not sufficient, Book Text exists because subheadings are not sufficient</p>
<p>Big Picture = Expand the Heading and the subheadings and compare them to each other</p>
<p>Application of Knowledge = App Version of the text (eg. Jupyter Notebooks) is what you create and learn first</p>
""")
    # The interface below is backed by the TTSforListeningPractice stub.
    with gr.Tab("Beginner - Songs - Chorus"):
        gr.HTML("Essentially if the sounds are repeated or long notes they are easy to remember")
        gr.Interface(fn=TTSforListeningPractice, inputs="text", outputs="text", title="Placeholder - paste chorus here and use TTS or make notes to save here")
    # Loads an interface from another Space by name.
    with gr.Tab("Transcribe - RASMUS Whisper"):
        gr.HTML("""<p>If this tab doesnt work use the link below ⬇️</p> <a href="https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles">https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles</a>""")
        gr.Interface.load("spaces/RASMUS/Whisper-youtube-crosslingual-subtitles", title="Subtitles")
    # Subtitle tooling: SRT splitting, line merging and video segmentation.
    with gr.Tab("Advanced - LingQ Addons ideas"):
        gr.HTML("Extra functions needed - Persitent Sentence translation, UNWFWO, POS tagging and Word Count per user of words in their account. Macaronic Text is also another way to practice only the important information")
        with gr.Tab("Merged Subtitles"):
            gr.HTML("SRT Contents to W4W Split SRT for Google Translate")
            gr.Interface(fn=split_srt_file, inputs="text", outputs="text")
            gr.HTML("Text for w4w creation in G Translate")
            gr.Interface(fn=splittext, inputs="text", outputs="text")
            with gr.Row():
                RomanFile = gr.File(label="Paste Roman")
                W4WFile = gr.File(label="Paste Word 4 Word")
                FullMeanFile = gr.File(label="Paste Full Meaning")
                MacaronicFile = gr.File(label="Paste Macaronic Text")
                # SentGramFormula is not among the inputs wired to the merge button below.
                SentGramFormula = gr.File(label="Paste Sentence Grammar Formula Text")
            with gr.Row():
                MergeButton = gr.Button()
            with gr.Row():
                MergeOutput = gr.TextArea(label="Output")
            MergeButton.click(merge_lines, inputs=[RomanFile, W4WFile, FullMeanFile, MacaronicFile], outputs=[MergeOutput])
        with gr.Tab("Split video to segments"):
            gr.Interface(VideotoSegment, inputs=[spvvideo_file_input, spvsubtitle_file_input], outputs=spvdownload_output)
    with gr.Tab("Sentence to Format"):
        gr.Interface(fn=split_verbs_nouns , inputs="text", outputs=["text", "text"], title="Comprehension reading and Sentence Format Creator")
        gr.Text("Text to Closed Class + Adjectives + Punctuation or Noun Verb + Punctuation ")
    # Second interface over create_dictionary (it also appears under "Spelling and Chunks").
    with gr.Tab("Dictionary from text"):
        gr.Interface(fn=create_dictionary, inputs="text", outputs="text", title="Two Letter Dictionary")
# Start the app when the module is executed.
lliface.launch()