srisurya
committed on
Commit · 5e66110
Parent(s): 2935122
final
Browse files
- .gitignore +1 -0
- app.py +163 -0
- images/hindi.jpg +0 -0
- images/test.jpg +0 -0
- images/testocr.png +0 -0
- requirements.txt +12 -0
.gitignore
CHANGED
@@ -122,6 +122,7 @@ celerybeat.pid
 *.sage.py
 
 # Environments
+ocrenv/
 .env
 .venv
 env/
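The new ocrenv/ entry presumably excludes the project's local virtual environment from version control, alongside the existing .env, .venv, and env/ entries.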
app.py
ADDED
@@ -0,0 +1,163 @@
from transformers import AutoModel, AutoTokenizer, AutoProcessor
import streamlit as st
import os
from PIL import Image
import torch
from torchvision import io
import torchvision.transforms as transforms
import random
import easyocr
import numpy as np


# Button callbacks that toggle flags kept in Streamlit's session state.
def start():
    st.session_state.start = True


def reset():
    del st.session_state['start']


# English OCR via the GOT model; the leading underscores keep the model and
# tokenizer out of st.cache_data's argument hashing.
@st.cache_data
def get_text(image_file, _model, _tokenizer):
    res = _model.chat(_tokenizer, image_file, ocr_type='ocr')
    return res


# Hindi OCR via EasyOCR on a PIL image (CPU only).
@st.cache_data
def extract_text_easyocr(_image):
    reader = easyocr.Reader(['hi'], gpu=False)
    results = reader.readtext(np.array(_image))
    # Each result is (bounding box, text, confidence); keep only the text.
    return " ".join([result[1] for result in results])


# Load the CPU build of the GOT OCR model and its tokenizer once per session.
@st.cache_resource
def model():
    tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
    model = AutoModel.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
    model = model.eval()
    return model, tokenizer


# Wrap every keyword that occurs in the text in a coloured <mark> tag.
# Note: the membership test is case-insensitive, but the replacement only
# highlights exact-case occurrences of the keyword.
@st.cache_resource
def highlight_keywords(text, keywords):
    colors = generate_unique_colors(len(keywords))
    highlighted_text = text
    found_keywords = []
    for keyword, color in zip(keywords, colors):
        if keyword.lower() in text.lower():
            highlighted_text = highlighted_text.replace(keyword, f'<mark style="background-color: {color};">{keyword}</mark>')
            found_keywords.append(keyword)
    return highlighted_text, found_keywords


def search():
    st.session_state.search = True


# Generate n distinct random hex colours for the keyword highlights.
@st.cache_data
def generate_unique_colors(n):
    colors = []
    for i in range(n):
        color = "#{:06x}".format(random.randint(0, 0xFFFFFF))
        while color in colors:
            color = "#{:06x}".format(random.randint(0, 0xFFFFFF))
        colors.append(color)
    return colors


st.title("A Web-Based Text Extraction and Retrieval System")

language = st.selectbox("Select a language:", ["English", "Hindi"])

if language == "English":
    st.subheader("You selected English!")
    st.button("Let's get started", on_click=start)

    if 'start' not in st.session_state:
        st.session_state.start = False

    if 'search' not in st.session_state:
        st.session_state.search = False

    if 'reset' not in st.session_state:
        st.session_state.reset = False

    if st.session_state.start:
        uploaded_file = st.file_uploader("Upload an Image", type=["png", "jpg", "jpeg"])

        if uploaded_file is not None:
            st.subheader("Uploaded Image:")
            st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)

            MODEL, TOKENIZER = model()

            # Persist the upload to disk so the GOT model can read it by path.
            if not os.path.exists("images"):
                os.makedirs("images")
            with open(f"images/{uploaded_file.name}", "wb") as f:
                f.write(uploaded_file.getbuffer())

            extracted_text = get_text(f"images/{uploaded_file.name}", MODEL, TOKENIZER)

            st.subheader("Extracted Text")
            st.write(extracted_text)

            keywords_input = st.text_input("Enter keywords to search within the extracted text (comma-separated):")

            if keywords_input:
                keywords = [keyword.strip() for keyword in keywords_input.split(',')]
                highlighted_text, found_keywords = highlight_keywords(extracted_text, keywords)
                st.button("Search", on_click=search)

                if st.session_state.search:
                    st.subheader("Search Results:")
                    if found_keywords:
                        st.markdown(highlighted_text, unsafe_allow_html=True)
                        st.write(f"Found keywords: {', '.join(found_keywords)}")
                    else:
                        st.warning("No keywords were found in the extracted text.")

                    not_found_keywords = set(keywords) - set(found_keywords)
                    if not_found_keywords:
                        st.error(f"Keywords not found: {', '.join(not_found_keywords)}")

            st.button("Reset", on_click=reset)

elif language == "Hindi":
    st.subheader("You selected HINDI!")
    st.button("Let's get started", on_click=start)

    if 'start' not in st.session_state:
        st.session_state.start = False

    if 'search' not in st.session_state:
        st.session_state.search = False

    if 'reset' not in st.session_state:
        st.session_state.reset = False

    if st.session_state.start:
        uploaded_file = st.file_uploader("Upload an Image", type=["png", "jpg", "jpeg"])

        if uploaded_file is not None:
            st.subheader("Uploaded Image:")
            st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
            image = Image.open(uploaded_file)
            # Save a copy of the upload alongside the English pipeline's images.
            if not os.path.exists("images"):
                os.makedirs("images")
            with open(f"images/{uploaded_file.name}", "wb") as f:
                f.write(uploaded_file.getbuffer())
            extracted_text_hindi = extract_text_easyocr(image)
            st.subheader("Extracted Text:")
            st.write(extracted_text_hindi)

            keywords_input = st.text_input("Enter keywords to search within the extracted text (comma-separated):")
            if keywords_input:
                keywords = [keyword.strip() for keyword in keywords_input.split(',')]
                highlighted_text, found_keywords = highlight_keywords(extracted_text_hindi, keywords)
                st.button("Search", on_click=search)

                if st.session_state.search:
                    st.subheader("Search Results:")
                    if found_keywords:
                        st.markdown(highlighted_text, unsafe_allow_html=True)
                        st.write(f"Found keywords: {', '.join(found_keywords)}")
                    else:
                        st.warning("No keywords were found in the extracted text.")

                    not_found_keywords = set(keywords) - set(found_keywords)
                    if not_found_keywords:
                        st.error(f"Keywords not found: {', '.join(not_found_keywords)}")
            st.button("Reset", on_click=reset)
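For reference, the GOT checkpoint is loaded with trust_remote_code=True, so OCR goes through the custom chat() method defined in the model repository rather than a standard transformers pipeline. A minimal sketch of the same call outside the Streamlit UI, assuming the srimanth-d/GOT_CPU checkpoint and one of the sample images added in this commit, might look like:

# Standalone sanity check: run GOT-OCR on one image without the Streamlit app.
# Mirrors the calls made in app.py; chat() comes from the model's remote code.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
model = AutoModel.from_pretrained(
    'srimanth-d/GOT_CPU',
    trust_remote_code=True,
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
).eval()

text = model.chat(tokenizer, 'images/test.jpg', ocr_type='ocr')  # plain-text OCR mode
print(text)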
images/hindi.jpg
ADDED
images/test.jpg
ADDED
images/testocr.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+torch
+torchvision
+torchaudio
+transformers
+Pillow
+verovio
+tiktoken
+importlib_resources
+accelerate>=0.26.0
+safetensors
+streamlit
+easyocr
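With these dependencies installed (for example via pip install -r requirements.txt), the app can be launched with the standard Streamlit CLI: streamlit run app.py. Note that EasyOCR downloads its Hindi detection and recognition models the first time easyocr.Reader(['hi']) is constructed, and the GOT checkpoint is fetched from the Hugging Face Hub the first time model() runs.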