JavierGon12 committed on
Commit
d8e07ba
1 Parent(s): 86165e8

Insert all files

app.py CHANGED
@@ -1,17 +1,89 @@
   import streamlit as st
- from diffusers import DDPMScheduler, UNet2DModel
  from PIL import Image
- import torch
- import numpy as np
-
- import torch
- from diffusers import StableDiffusionPipeline
-
- pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
- pipe = pipe.to("cuda")
-
- prompt = st.text_input('Insert your prompt here')
-
- image = pipe(prompt).images[0]
+ # Install libraries
+
  import streamlit as st
  from PIL import Image
+ import streamlit as st
+ from transformers import pipeline
+ import pandas as pd
+ import plotly.express as px
+ import matplotlib.pyplot as plt
+ from pathlib import Path
+ import base64
+ from st_pages import Page, add_page_title, show_pages
+ from streamlit_extras.badges import badge
+
+
+ # Config
+ # Initial page config
+
+ st.set_page_config(
+     page_title='RetrAIced',
+     page_icon='🧠',
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+ def local_css(file_name):
+     with open(file_name) as f:
+         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+
+ local_css("style.css")
+
+
+ def img_to_bytes(img_path):
+     img_bytes = Path(img_path).read_bytes()
+     encoded = base64.b64encode(img_bytes).decode()
+     return encoded
+
+
+ show_pages(
+     [
+         Page("app.py", "Home", "🏠"),
+         Page("pages/Question Answering.py", "Question Answering", ":grey_question:"),
+         Page("pages/Speech Recognition.py", "Speech Recognition", ":speaking_head_in_silhouette:"),
+         Page("pages/Summarization.py", "Summarization", ":bookmark_tabs:"),
+         Page("pages/Text to Image.py", "Text to Image", ":lower_left_paintbrush:"),
+         Page("pages/Text Classification.py", "Text Classification", ":book:"),
+         Page("pages/Image to text.py", "Image to Text", ":camera:"),
+         Page("pages/Text Generation.py", "Text Generation", ":printer:"),
+     ]
+ )
+
+ # Add the app logo
+ st.image("logo retraced 2.png")
+ st.header("Intro")
+ st.write("##")
+ st.markdown(
+     """
+     Welcome to **RetrAIced**, a user-friendly app that brings a diverse set of AI models together on a single platform for exploration and interaction. From natural language processing to image
+     recognition, the app offers real-time demonstrations of predictive analytics and shows how different AI technologies can work together. \n
+
+     Large language models (LLMs), especially those from Hugging Face, have transformed natural language understanding and generation and have become indispensable in today's data-driven world.
+     RetrAIced breaks down the barriers between individual models and makes their collective power accessible to users of all backgrounds.
+     The app invites developers, data enthusiasts, and the curious to experiment with models for tasks such as Question Answering, Speech Recognition, Summarization, Text
+     Classification, and Text Generation, making projects more versatile, efficient, and engaging.\n
+
+     Join the creator on a journey into the world of language models through RetrAIced, and turn a collection of separate models into a single, intuitive AI experience.
+
+     """
+     , unsafe_allow_html=True)
+
+
+ st.write("##")
+ st.write("##")
+
+ # Create 2 columns for the Hugging Face and GitHub profiles
+ left_col, right_col = st.columns(2)
+
+ with left_col:
+     st.info('**Hugging Face: [@JavierGon12](https://huggingface.co/JavierGon12)**', icon="💡")
+
+ with right_col:
+     badge(type='github', name='JaviGon12')
+     #st.info('**GitHub: [@JaviGon12](https://github.com/JaviGon12)**', icon="💻")
logo retraced 2.png ADDED
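Note (not part of the commit): the new app.py and the pages added below depend on several third-party packages. A plausible requirements.txt for the Space, inferred only from the imports and left unpinned because the exact versions are not shown here, would be roughly:

    streamlit
    st-pages
    streamlit-extras
    streamlit-mic-recorder
    transformers
    torch
    torchaudio
    diffusers
    datasets
    pillow
    pypdf
    PyPDF2
    pypdfium2
    pandas
    plotly
    matplotlib
    numpy
    requests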
pages/Image to text.py ADDED
@@ -0,0 +1,19 @@
+ from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
+ import requests
+ from PIL import Image
+ import streamlit as st
+
+ processor = Pix2StructProcessor.from_pretrained('google/deplot')
+ model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')
+
+ document = st.file_uploader(label="Upload the document you want to explore", type=["png", 'jpg', "jpeg", "pdf"])
+
+ if document is None:
+     st.write("Please upload the document in the box above")
+ else:
+     image = Image.open(document)
+     st.image(image, "Document uploaded")
+
+     inputs = processor(images=image, text="Generate underlying data table of the figure below:", return_tensors="pt")
+     predictions = model.generate(**inputs, max_new_tokens=512)
+     st.write(processor.decode(predictions[0], skip_special_tokens=True))
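A caveat on the page above (an observation, not part of the commit): the uploader accepts "pdf", but PIL's Image.open cannot read PDF files, so a PDF upload would raise an error here. A small helper in the spirit of the Question Answering page below, sketched under the assumption that pypdfium2 and pypdf are installed (they are imported there), could render the first PDF page to an image first; the to_pil name is hypothetical:

    from PIL import Image
    import pypdfium2 as pdfium
    from pypdf import PdfReader
    from pypdf.errors import PdfReadError

    def to_pil(uploaded_file):
        # Treat the upload as a PDF if possible; otherwise open it as a plain image.
        try:
            PdfReader(uploaded_file)
            page = pdfium.PdfDocument(uploaded_file).get_page(0)
            return page.render(scale=300 / 72).to_pil()
        except PdfReadError:
            return Image.open(uploaded_file)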
pages/Question Answering.py ADDED
@@ -0,0 +1,85 @@
+ import re
+ import streamlit as st
+ from transformers import DonutProcessor, VisionEncoderDecoderModel
+ from datasets import load_dataset
+ import torch
+ import os
+ from PIL import Image
+ import PyPDF2
+ from pypdf.errors import PdfReadError
+ from pypdf import PdfReader
+ import pypdfium2 as pdfium
+
+ processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+ model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+
+ device = "cpu"
+ model.to(device)
+
+ # Create the uploader
+ document = st.file_uploader(label="Upload the document you want to explore", type=["png", 'jpg', "jpeg", "pdf"])
+
+ question = st.text_input("Insert your question here")
+
+ if document is None:
+     st.write("Please upload the document in the box above")
+ else:
+     try:
+         # If this succeeds, the upload is a PDF: render its first page to an image
+         PdfReader(document)
+         pdf = pdfium.PdfDocument(document)
+         page = pdf.get_page(0)
+         pil_image = page.render(scale=300 / 72).to_pil()
+         #st.image(pil_image, caption="Document uploaded", use_column_width=True)
+         task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
+         #question = "What's the total amount?"
+         prompt = task_prompt.replace("{user_input}", question)
+         decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
+         pixel_values = processor(pil_image, return_tensors="pt").pixel_values
+         outputs = model.generate(
+             pixel_values.to(device),
+             decoder_input_ids=decoder_input_ids.to(device),
+             max_length=model.decoder.config.max_position_embeddings,
+             pad_token_id=processor.tokenizer.pad_token_id,
+             eos_token_id=processor.tokenizer.eos_token_id,
+             use_cache=True,
+             bad_words_ids=[[processor.tokenizer.unk_token_id]],
+             return_dict_in_generate=True,
+         )
+         sequence = processor.batch_decode(outputs.sequences)[0]
+         sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+         sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+         st.image(pil_image, "Document uploaded")
+         st.write(processor.token2json(sequence))
+         print(processor.token2json(sequence))
+
+     except PdfReadError:
+         # The upload is a plain image: open it directly and prepare the decoder inputs
+         document = Image.open(document)
+
+         task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
+         prompt = task_prompt.replace("{user_input}", question)
+         decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
+         pixel_values = processor(document, return_tensors="pt").pixel_values
+
+         outputs = model.generate(
+             pixel_values.to(device),
+             decoder_input_ids=decoder_input_ids.to(device),
+             max_length=model.decoder.config.max_position_embeddings,
+             pad_token_id=processor.tokenizer.pad_token_id,
+             eos_token_id=processor.tokenizer.eos_token_id,
+             use_cache=True,
+             bad_words_ids=[[processor.tokenizer.unk_token_id]],
+             return_dict_in_generate=True,
+         )
+         sequence = processor.batch_decode(outputs.sequences)[0]
+         sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+         sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+         st.image(document, "Document uploaded")
+         st.write(processor.token2json(sequence))
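For reference (not part of the commit): with the DocVQA-finetuned Donut checkpoint, processor.token2json(sequence) normally yields a small dict echoing the question and the predicted answer, so the st.write calls above display something roughly like the following (the values are illustrative only):

    {"question": "What is the invoice total?", "answer": "$154.06"}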
pages/Speech Recognition.py ADDED
@@ -0,0 +1,180 @@
+ from transformers import BartForConditionalGeneration, BartTokenizer
+ import streamlit as st
+ import torch
+ from transformers import AutoProcessor, WhisperForConditionalGeneration
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ import torchaudio
+ from transformers import pipeline
+ from streamlit_mic_recorder import mic_recorder, speech_to_text
+ import numpy as np
+
+
+ option = st.selectbox("How do you want to import the audio file?", ("Microphone", "Upload file"))
+ if option == "Microphone":
+     # Record the audio directly in the browser
+     st.write("Record your voice, and play the recorded audio:")
+     audio = mic_recorder(start_prompt="Press the button to start recording ⏺️", stop_prompt="Press the button to stop the recording ⏹️", key='recorder')
+
+     if audio is None:
+         st.write("Please start the recording in the box above")
+     else:
+         st.audio(audio["bytes"])
+
+ elif option == "Upload file":
+     audio = st.file_uploader(label="Upload your audio file here", type=["wav", 'mp3'])
+     if audio:
+         st.audio(audio)
+
+ option_language = st.selectbox(
+     'Select the language of your audio',
+     ('English', 'Spanish', 'German', 'French', 'Chinese'))
+
+
+ if audio is None:
+     st.write("Please upload the audio in the box above")
+
+ else:
+     if option_language == "English":
+         def transcribe_audio(audio_file):
+             # Load the audio file
+             waveform, sample_rate = torchaudio.load(audio_file)
+
+             # Ensure mono-channel audio
+             if waveform.shape[0] > 1:
+                 waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+             # Resample to 16 kHz if not already
+             if sample_rate != 16000:
+                 waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+
+             # Convert to a list of integers
+             audio_input = waveform.squeeze().numpy().astype(int).tolist()
+
+             # Use Hugging Face's ASR pipeline
+             asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2")
+
+             # Transcribe the audio
+             transcript = asr_pipeline(waveform.numpy()[0])
+
+             return transcript
+
+         transcription = transcribe_audio(audio)
+         st.write("Here is your transcription:")
+         st.write(transcription)
+
+     elif option_language == 'Spanish':
+         def transcribe_audio(audio_file):
+             # Load the audio file
+             waveform, sample_rate = torchaudio.load(audio_file)
+
+             # Ensure mono-channel audio
+             if waveform.shape[0] > 1:
+                 waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+             # Resample to 16 kHz if not already
+             if sample_rate != 16000:
+                 waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+
+             # Convert to a list of integers
+             audio_input = waveform.squeeze().numpy().astype(int).tolist()
+
+             # Use Hugging Face's ASR pipeline
+             asr_pipeline = pipeline("automatic-speech-recognition", model="Sandiago21/whisper-large-v2-spanish")
+
+             # Transcribe the audio
+             transcript = asr_pipeline(waveform.numpy()[0])
+
+             return transcript
+
+         transcription = transcribe_audio(audio)
+         st.write("Aqui tienes tu transcripcion:")
+         st.write(transcription)
+
+     elif option_language == 'German':
+         def transcribe_audio(audio_file):
+             # Load the audio file
+             waveform, sample_rate = torchaudio.load(audio_file)
+
+             # Ensure mono-channel audio
+             if waveform.shape[0] > 1:
+                 waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+             # Resample to 16 kHz if not already
+             if sample_rate != 16000:
+                 waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+
+             # Convert to a list of integers
+             audio_input = waveform.squeeze().numpy().astype(int).tolist()
+
+             # Use Hugging Face's ASR pipeline
+             asr_pipeline = pipeline("automatic-speech-recognition", model="primeline/whisper-large-v3-german")
+
+             # Transcribe the audio
+             transcript = asr_pipeline(waveform.numpy()[0])
+
+             return transcript
+
+         transcription = transcribe_audio(audio)
+         st.write("Hier ist Ihre Transkription:")
+         st.write(transcription)
+
+     elif option_language == "French":
+         def transcribe_audio(audio_file):
+             # Load the audio file
+             waveform, sample_rate = torchaudio.load(audio_file)
+
+             # Ensure mono-channel audio
+             if waveform.shape[0] > 1:
+                 waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+             # Resample to 16 kHz if not already
+             if sample_rate != 16000:
+                 waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+
+             # Convert to a list of integers
+             audio_input = waveform.squeeze().numpy().astype(int).tolist()
+
+             # Use Hugging Face's ASR pipeline
+             asr_pipeline = pipeline("automatic-speech-recognition", model="bofenghuang/whisper-large-v2-french")
+
+             # Transcribe the audio
+             transcript = asr_pipeline(waveform.numpy()[0])
+
+             return transcript
+
+         transcription = transcribe_audio(audio)
+         st.write("Ici, vous avez votre transcription")
+         st.write(transcription)
+
+     elif option_language == "Chinese":
+         def transcribe_audio(audio_file):
+             # Load the audio file
+             waveform, sample_rate = torchaudio.load(audio_file)
+
+             # Ensure mono-channel audio
+             if waveform.shape[0] > 1:
+                 waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+             # Resample to 16 kHz if not already
+             if sample_rate != 16000:
+                 waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+
+             # Convert to a list of integers
+             audio_input = waveform.squeeze().numpy().astype(int).tolist()
+
+             # Use Hugging Face's ASR pipeline
+             asr_pipeline = pipeline("automatic-speech-recognition", model="yi-ching/whisper-tiny-chinese-test")
+
+             # Transcribe the audio
+             transcript = asr_pipeline(waveform.numpy()[0])
+
+             return transcript
+
+         transcription = transcribe_audio(audio)
+         st.write("这是您的转录。")
+         st.write(transcription)
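One caveat on the page above (an observation, not part of the commit): in the "Microphone" branch, mic_recorder returns a dict of raw audio bytes, which is why playback reads audio["bytes"], but transcribe_audio then passes that dict straight to torchaudio.load, which expects a path or a file-like object. A small adapter along these lines would make both input paths uniform; the wrap_audio name is hypothetical:

    import io

    def wrap_audio(audio):
        # Uploaded files are already file-like; microphone recordings arrive as a
        # dict with (at least) a "bytes" entry and need wrapping before torchaudio.load.
        if isinstance(audio, dict):
            return io.BytesIO(audio["bytes"])
        return audio

    transcription = transcribe_audio(wrap_audio(audio))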
pages/Summarization.py ADDED
@@ -0,0 +1,109 @@
+ from transformers import BartForConditionalGeneration, BartTokenizer
+ import streamlit as st
+ import torch
+ from transformers import AutoProcessor, WhisperForConditionalGeneration
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ import torchaudio
+ from transformers import pipeline
+
+ # Load your own audio file
+
+ audio = st.file_uploader(label="Upload your audio file here", type=["wav", 'mp3'])
+
+ option_language = st.selectbox(
+     'Select the language of your audio',
+     ('English', 'Spanish', 'German', 'French', 'Chinese'))
+
+ if audio is None:
+     st.write("Please upload the audio in the box above")
+
+ else:
+     if option_language == "English":
+         def transcribe_audio(audio_file):
+             # Load the audio file
+             waveform, sample_rate = torchaudio.load(audio_file)
+
+             # Ensure mono-channel audio
+             if waveform.shape[0] > 1:
+                 waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+             # Resample to 16 kHz if not already
+             if sample_rate != 16000:
+                 waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+
+             # Convert to a list of integers
+             audio_input = waveform.squeeze().numpy().astype(int).tolist()
+
+             # Use Hugging Face's ASR pipeline
+             asr_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
+
+             # Transcribe the audio
+             transcript = asr_pipeline(waveform.numpy()[0])
+
+             return transcript
+
+         transcription = transcribe_audio(audio)
+         print("Transcription", transcription)
+
+         ## Initiate the summarization model
+         tokenizer_summary = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+         model_summary = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+
+         def summarize_text(text, model, tokenizer, max_length=100):
+             input_ids = tokenizer.encode(text, return_tensors="pt")
+             summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
+             return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+         summary = summarize_text(transcription['text'], model_summary, tokenizer_summary)
+         st.write("Here is your summary!")
+         st.write(summary)
+
+     elif option_language == 'Spanish':
+         def transcribe_audio(audio_file):
+             # Load the audio file
+             waveform, sample_rate = torchaudio.load(audio_file)
+
+             # Ensure mono-channel audio
+             if waveform.shape[0] > 1:
+                 waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+             # Resample to 16 kHz if not already
+             if sample_rate != 16000:
+                 waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+
+             # Convert to a list of integers
+             audio_input = waveform.squeeze().numpy().astype(int).tolist()
+
+             # Use Hugging Face's ASR pipeline
+             asr_pipeline = pipeline("automatic-speech-recognition", model="Sandiago21/whisper-large-v2-spanish")
+
+             # Transcribe the audio
+             transcript = asr_pipeline(waveform.numpy()[0])
+
+             return transcript
+
+         transcription = transcribe_audio(audio)
+         print("Aqui tienes tu transcripción:", transcription)
+
+         ## Initiate the summarization model
+         tokenizer_summary = AutoTokenizer.from_pretrained("facebook/mbart-large-50", src_lang="es_XX")
+         model_summary = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")
+
+         def summarize_text(text, model, tokenizer, max_length=100):
+             input_ids = tokenizer.encode(text, return_tensors="pt")
+             summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
+             return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+         summary = summarize_text(transcription['text'], model_summary, tokenizer_summary)
+         st.write("Aqui tienes tu resumen!")
+         st.write(summary)
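A practical refinement (a sketch, not part of the commit): facebook/bart-large-cnn accepts at most 1024 input tokens, so a long transcript passed to summarize_text above can overflow the encoder. Truncating at tokenization time is the simplest guard:

    def summarize_text(text, model, tokenizer, max_length=100):
        # Truncate the transcript to the encoder's 1024-token limit before summarizing.
        input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=1024)
        summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)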
pages/Text Classification.py ADDED
@@ -0,0 +1,139 @@
+ import re
+ from transformers import DonutProcessor, VisionEncoderDecoderModel
+ from datasets import load_dataset
+ import torch
+ import streamlit as st
+ from PIL import Image
+ import PyPDF2
+ from pypdf.errors import PdfReadError
+ from pypdf import PdfReader
+ import pypdfium2 as pdfium
+
+ document = st.file_uploader(label="Upload the document you want to explore", type=["png", 'jpg', "jpeg", "pdf"])
+
+ model_option = st.selectbox("Select the output of the model:", ["Classification", "Extract Info"])
+ if model_option == "Classification":
+     processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
+     model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
+
+     device = "cpu"
+     model.to(device)
+     # Load the document image
+     if document is None:
+         st.write("Please upload the document in the box above")
+     else:
+         try:
+             PdfReader(document)
+             pdf = pdfium.PdfDocument(document)
+             page = pdf.get_page(0)
+             pil_image = page.render(scale=300 / 72).to_pil()
+
+             task_prompt = "<s_rvlcdip>"
+             decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
+
+             pixel_values = processor(pil_image, return_tensors="pt").pixel_values
+
+             outputs = model.generate(
+                 pixel_values.to(device),
+                 decoder_input_ids=decoder_input_ids.to(device),
+                 max_length=model.decoder.config.max_position_embeddings,
+                 pad_token_id=processor.tokenizer.pad_token_id,
+                 eos_token_id=processor.tokenizer.eos_token_id,
+                 use_cache=True,
+                 bad_words_ids=[[processor.tokenizer.unk_token_id]],
+                 return_dict_in_generate=True,
+             )
+
+             sequence = processor.batch_decode(outputs.sequences)[0]
+             sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+             sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+             st.image(pil_image, "Document uploaded")
+             st.write(processor.token2json(sequence))
+
+         except PdfReadError:
+             document = Image.open(document)
+             task_prompt = "<s_rvlcdip>"
+             decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
+
+             pixel_values = processor(document, return_tensors="pt").pixel_values
+
+             outputs = model.generate(
+                 pixel_values.to(device),
+                 decoder_input_ids=decoder_input_ids.to(device),
+                 max_length=model.decoder.config.max_position_embeddings,
+                 pad_token_id=processor.tokenizer.pad_token_id,
+                 eos_token_id=processor.tokenizer.eos_token_id,
+                 use_cache=True,
+                 bad_words_ids=[[processor.tokenizer.unk_token_id]],
+                 return_dict_in_generate=True,
+             )
+
+             sequence = processor.batch_decode(outputs.sequences)[0]
+             sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+             sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+             st.image(document, "Document uploaded")
+             st.write(processor.token2json(sequence))
+
+ elif model_option == "Extract Info":
+     processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
+     model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
+
+     device = "cpu"
+     model.to(device)
+     # Load the document image
+     if document is None:
+         st.write("Please upload the document in the box above")
+     else:
+         try:
+             PdfReader(document)
+             pdf = pdfium.PdfDocument(document)
+             page = pdf.get_page(0)
+             pil_image = page.render(scale=300 / 72).to_pil()
+
+             task_prompt = "<s_cord-v2>"
+             decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
+
+             pixel_values = processor(pil_image, return_tensors="pt").pixel_values
+
+             outputs = model.generate(
+                 pixel_values.to(device),
+                 decoder_input_ids=decoder_input_ids.to(device),
+                 max_length=model.decoder.config.max_position_embeddings,
+                 pad_token_id=processor.tokenizer.pad_token_id,
+                 eos_token_id=processor.tokenizer.eos_token_id,
+                 use_cache=True,
+                 bad_words_ids=[[processor.tokenizer.unk_token_id]],
+                 return_dict_in_generate=True,
+             )
+
+             sequence = processor.batch_decode(outputs.sequences)[0]
+             sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+             sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+             st.image(pil_image, "Document uploaded")
+             st.write(processor.token2json(sequence))
+
+         except PdfReadError:
+             document = Image.open(document)
+             task_prompt = "<s_cord-v2>"
+             decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
+
+             pixel_values = processor(document, return_tensors="pt").pixel_values
+
+             outputs = model.generate(
+                 pixel_values.to(device),
+                 decoder_input_ids=decoder_input_ids.to(device),
+                 max_length=model.decoder.config.max_position_embeddings,
+                 pad_token_id=processor.tokenizer.pad_token_id,
+                 eos_token_id=processor.tokenizer.eos_token_id,
+                 use_cache=True,
+                 bad_words_ids=[[processor.tokenizer.unk_token_id]],
+                 return_dict_in_generate=True,
+             )
+
+             sequence = processor.batch_decode(outputs.sequences)[0]
+             sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+             sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+             st.image(document, "Document uploaded")
+             st.write(processor.token2json(sequence))
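A possible optimization for this and the other pages (a sketch, not part of the commit; it assumes the Space runs Streamlit 1.18+ where st.cache_resource is available): each rerun currently reloads the checkpoints from scratch, so caching the loaded processor/model pair avoids that cost:

    import streamlit as st
    from transformers import DonutProcessor, VisionEncoderDecoderModel

    @st.cache_resource
    def load_donut(checkpoint):
        # Load once per process; later reruns reuse the cached processor/model pair.
        processor = DonutProcessor.from_pretrained(checkpoint)
        model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
        return processor, model

    processor, model = load_donut("naver-clova-ix/donut-base-finetuned-rvlcdip")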
pages/Text Generation.py ADDED
@@ -0,0 +1,25 @@
+ import streamlit as st
+ from streamlit_mic_recorder import mic_recorder, speech_to_text
+
+ state = st.session_state
+
+ if 'text_received' not in state:
+     state.text_received = []
+
+ c1, c2 = st.columns(2)
+ with c1:
+     st.write("Convert speech to text:")
+ with c2:
+     text = speech_to_text(language='en', use_container_width=True, just_once=True, key='STT')
+
+ if text:
+     state.text_received.append(text)
+
+ for text in state.text_received:
+     st.text(text)
+
+ st.write("Record your voice, and play the recorded audio:")
+ audio = mic_recorder(start_prompt="⏺️", stop_prompt="⏹️", key='recorder')
+
+ if audio:
+     st.audio(audio['bytes'])
pages/Text to Image.py ADDED
@@ -0,0 +1,19 @@
+ import torch
+ from diffusers import LCMScheduler, AutoPipelineForText2Image
+ import streamlit as st
+
+ model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+ adapter_id = "latent-consistency/lcm-lora-sdxl"
+
+ pipe = AutoPipelineForText2Image.from_pretrained(model_id, torch_dtype=torch.float32, variant="fp16")
+ pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+ #pipe.to("cuda")
+
+ # load and fuse the LCM LoRA
+ pipe.load_lora_weights(adapter_id)
+ pipe.fuse_lora()
+ prompt = st.text_input("Insert your prompt here")
+
+ # disable guidance by passing guidance_scale=0
+ image = pipe(prompt=prompt, num_inference_steps=4, guidance_scale=0).images[0]
+ st.image(image, f"Image generated from your prompt: {prompt}")
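On GPU-backed hardware, the usual LCM-LoRA setup (a sketch, not what this commit ships; it assumes a CUDA device is available) loads the fp16 weights and moves the pipeline to the GPU, much as the removed app.py code did for vanilla Stable Diffusion:

    pipe = AutoPipelineForText2Image.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16")
    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
    pipe.to("cuda")  # on CPU, keep torch.float32 as in the page above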
style.css ADDED
@@ -0,0 +1,54 @@
+
+ /* styles.css */
+
+ .title {
+     color: #ffffff;
+     font-size: 34px;
+     font-weight: bold;
+     font-family: monospace;
+ }
+
+ .custom-text {
+     color: #ffffff;
+     font-size: 20px;
+     font-weight: bold;
+     font-family: monospace;
+ }
+
+ .custom-background {
+     background-color: rgb(110, 159, 238);
+     padding: 12px;
+     font-size: 16px;
+     font-family: monospace;
+ }
+
+ /* Style inputs with type="text", type="email", and textareas */
+ input[type=text], input[type=email], textarea {
+     width: 100%;              /* Full width */
+     padding: 12px;            /* Some padding */
+     border: 1px solid #ccc;   /* Gray border */
+     border-radius: 4px;       /* Rounded borders */
+     box-sizing: border-box;   /* Make sure that padding and width stay in place */
+     margin-top: 6px;          /* Add a top margin */
+     margin-bottom: 16px;      /* Bottom margin */
+     resize: vertical;         /* Allow the user to vertically resize the textarea (not horizontally) */
+ }
+
+ /* Style the submit button with a specific background color etc. */
+ button[type=submit] {
+     background-color: #04AA6D;
+     color: white;
+     padding: 12px 20px;
+     border: none;
+     border-radius: 4px;
+     cursor: pointer;
+ }
+
+ /* When moving the mouse over the submit button, add a darker green color */
+ button[type=submit]:hover {
+     background-color: #45a049;
+ }