Spaces:

OdiaGenAI
/

Olive_scrapper

Runtime error

File size: 14,171 Bytes

import justext
import streamlit as st
from lxml import etree
# import streamlit.components.v1 as components


# File Processing pkgs
from PIL import Image
import requests
# import xml.dom.minidom
from bs4 import BeautifulSoup
# import json
import docx2txt
# import textract
from PyPDF2 import PdfFileReader
import pdfplumber
import os



# ---- LOAD ASSETS ----
img_page_icon = Image.open("./olive_webscrapping.jpg")

# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")


# Load CSS file
def load_css(file_path):
    with open(file_path) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


# Load CSS file
load_css('styles.css')


# ----- FUNCTIONS ----
# function to check whether the url is a sitemap or not
def check_sitemap(url):
    # Check the URL's ending
    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
        try:
            # Parse the content as XML
            response = requests.get(url)
            xml_content = etree.fromstring(response.content)
            # Check for sitemap-specific elements
            if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
                return True
        except etree.XMLSyntaxError:
            pass

    # Additional conditions for identifying sitemaps
    if 'sitemap' in url.lower():
        # Perform additional checks specific to the website's structure or naming conventions
        return True

    return False


# function to get urls from the site map and extract those data
def extract_urls_from_sitemaps(xml_url):
    # Make a GET request to the URL and extract the xml content
    response = requests.get(xml_url)

    soup = BeautifulSoup(response.text, 'xml')
    extracted_urls = []

    # check if the sitemap contains nested sitemaps
    sitemap_tags = soup.find_all('sitemap')
    if sitemap_tags:
        # Process nested sitemaps
        for sitemap_tag in sitemap_tags:
            print("sitemap_tags:" + sitemap_tag)
            nested_url = sitemap_tag.find('loc').text
            print('nested_url:', nested_url)
            nested_urls = extract_urls_from_sitemaps(nested_url)
            extracted_urls.extend(nested_urls)
    else:
        # Extract URLs from the current sitemap
        loc_tags = soup.find_all('loc')
        for loc_tag in loc_tags:
            # if loc_tag.parent.name != 'image':
            url = loc_tag.text
            if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
                print(f"url skipped because it is a {url.split('.')[-1]}")
            else:
                print('url:', url)
                extracted_urls.append(url)

    return extracted_urls


# function to check whether the entered url is valid
def valid_url(url):
    try:
        # Make a GET request to the URL and extract the text content
        response = requests.get(url)
        if response.status_code == 200:
            return True

    except requests.exceptions.RequestException as e:
        return False


# function to create a custom stoplist for justext
def custom_stoplist():
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
    ]
    return frozenset(odia_stopwords)


# function to extract data from url using justext
def extract_data_from_url_(url):
    response = requests.get(url)
    response.raise_for_status()
    page = response.content

    data_url = ""
    para = ""
    paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            para = para + '\n' + paragraph.text

    data_url = ('\n\nFrom url:' + url + '\n' + para + '\n')

    return data_url


sitemap_data = ""


# function to get the text from pdf using PyPDF2
def read_pdf(file):
    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    # all_page_text = ""
    # for i in range(count):
    #     page = pdfReader.getPage(i)
    #     all_page_text += page.extractText()
    #
    # return all_page_text
    return count


# function to run the enter button
def run_function(url, documents):
    data = ""
    # Check if the user has provided a URL
    if url:
        if valid_url(url):
            data = extract_data_from_url_(url)
            st.text_area("Extracted Text", value=data, height=200)
            # return extract status, and the data extracted
            return True, data
        else:
            return False, data


    # Check if the user has provided a document
    elif documents is not None:
        for document in documents:
            document_details = {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size
            }
            st.write(document_details)

            # Extract content from the txt file
            if document.type == "text/plain":
                # Read as bytes
                data += str(document.read(), "utf-8")

            # Extract content from the pdf file
            elif document.type == "application/pdf":
                # using PyPDF2
                # data += read_pdf(document)

                # using pdfplumber
                try:
                    with pdfplumber.open(document) as pdf:
                        all_text = ""
                        for page in pdf.pages:
                            text = page.extract_text()
                            all_text += text + "\n"
                        data += all_text
                except requests.exceptions.RequestException as e:
                    st.write("None")

            # Extract content from the docx file
            elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                data += docx2txt.process(document)

        # Display the extracted text content from file
        st.write("attached")
        st.text_area("Extracted Text", value=data, height=200)
        # return extract status, and the data extracted
        return True, data

    else:
        st.error("Error: An error occurred while fetching content.")
        # return extract status, and the data extracted
        return False, data


def main():
    # ---- HEADER SECTION ----
    with st.container():
        st.subheader("Hi!! :wave:")
        st.write("##")
        st.markdown("<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
                    unsafe_allow_html=True)
        st.markdown("<h5>Generative AI and LLM for the Odia Language.</h5>", unsafe_allow_html=True)
        # st.title("Odia Generative AI")

        st.markdown("<h1 class='title'>Odia Generative AI</h1>", unsafe_allow_html=True)

    # ---- BODY SECTION ----
    with st.container():
        st.subheader("Collecting monolingual data (Odia or any Indic Languages)")

        # dividing the body section into 3 columns for url, attach button and enter button
        col1, col2, col3 = st.columns([0.6, 0.2, 0.2])
        # url/xml
        with col1:

            url_or_xml = st.text_input(label='', placeholder="Enter URL")
            is_a_sitemap = check_sitemap(url_or_xml)

        # attached files
        with col2:

            documents = st.file_uploader("", type=["pdf", "txt", "docx"], accept_multiple_files=True)
            if not documents:
                documents = None
            else:
                for doc in documents:
                    if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
                        # if documents is not the relevant type
                        st.error("Unsupported file: " + doc.name)

        # Initialize state of button Enter
        with col3:
            st.write('##')
            if "button_enter" not in st.session_state:
                st.session_state.button_enter = False

            if st.button("Enter"):
                st.session_state.button_enter = True
                # st.write("session state true")

        if "extracted" not in st.session_state:
            st.session_state.extracted = False
        data = ""

        # the enter button
        if st.session_state.button_enter:
            # check if it is a sitemap or not
            if is_a_sitemap:
                if "Initial" not in st.session_state:
                    st.session_state.Initial = True
                # check whether its the initial state
                if st.session_state.Initial == True:
                    # print("\n\n\n\n1)Initial State", st.session_state.Initial, "\n\n\n\n\n")
                    xml = url_or_xml
                    st.write("It is a sitemap")
                    stored_sitemap_urls = extract_urls_from_sitemaps(xml)
                    print('\nno. of urls: ', len(stored_sitemap_urls))

                    if stored_sitemap_urls:
                        print(stored_sitemap_urls)
                        for sitemap_url in stored_sitemap_urls:

                            if valid_url(sitemap_url):
                                print(sitemap_url)
                                # using justext to extract data
                                data = data + extract_data_from_url_(sitemap_url)
                            else:
                                st.error("Couldnt extract data from " + sitemap_url)

                        if "sitemap_data" not in st.session_state:
                            st.session_state.sitemap_data = data
                        # print("\n\n\nst.session.data ", st.session_state.sitemap_data)
                        # print("\n\n\n\nRUNNING \n\n\n\n")
                        st.session_state.Initial = False
                        print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
                        st.session_state.extracted = True
                        # st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)

                    else:
                        st.error("Error: Invalid sitemap.")


            else:
                url = url_or_xml
                st.session_state.extracted, data = run_function(url, documents)

            if st.session_state.extracted:
                if is_a_sitemap:
                    st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
                col1, col2 = st.columns([0.5, 0.5])

                with col1:
                    saved_button = False

                    if is_a_sitemap:
                        saved_data = st.session_state.sitemap_data
 
                        if st.download_button(
                            label="Save",
                            data=saved_data                            
                        ):
                            saved_button = True
                        
                    else:
                        if st.download_button(
                            label="Save",
                            data=data                            
                        ):
                            saved_button = True

                with col2:
                    if st.button("Clear"):
                        st.session_state.button_enter = False
                        st.session_state.Initial = True
                        st.session_state.extracted = False
                        if 'sitemap_data' in st.session_state:
                            del st.session_state['sitemap_data']
                        st.session_state.button_enter = False
                        st.experimental_rerun()

                if saved_button:
                    # Confirmation message
                    st.success(f"File saved successfully.")

            else:
                st.warning("Data not extracted")


if __name__ == "__main__":
    main()