BookTODataset / app.py
imseldrith's picture
Update app.py
e8e2695
import io
import re
import urllib.request

import pandas as pd
import PyPDF2
import streamlit as st
def convert_pdf_to_txt(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: The PDF to read. May be raw ``bytes``/``bytearray`` (which
            is what ``download_book`` and ``upload_book`` return), or anything
            PyPDF2 itself accepts (a binary file-like object or a path).

    Returns:
        The extracted text of all pages, concatenated in page order.
    """
    # PyPDF2 needs a seekable stream (or a path); raw bytes must be wrapped.
    if isinstance(pdf_file, (bytes, bytearray, memoryview)):
        pdf_file = io.BytesIO(bytes(pdf_file))

    # PyPDF2 >= 1.28 exposes PdfReader/pages/extract_text; the legacy
    # PdfFileReader/numPages/getPage/extractText names were removed in 3.x.
    if hasattr(PyPDF2, "PdfReader"):
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() for page in pdf_reader.pages)

    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    return "".join(
        pdf_reader.getPage(i).extractText() for i in range(pdf_reader.numPages)
    )
def preprocess_text(text):
    """Return *text* lowercased with every non-word, non-whitespace character removed.

    Letters, digits, underscores and whitespace are kept; everything else
    (punctuation, symbols) is dropped.
    """
    unwanted = re.compile(r'[^\w\s]')
    stripped = unwanted.sub('', text)
    return stripped.lower()
def download_book(url):
    """Fetch the resource at *url* and return its raw content.

    Args:
        url: Location of the book to download.

    Returns:
        The response body as ``bytes``, or ``None`` if the URL is malformed
        or the resource could not be retrieved.
    """
    # NOTE(review): urlopen also honours non-HTTP schemes such as file:// and
    # ftp://. The caller passes user-entered text, so consider restricting the
    # accepted schemes if this app is exposed publicly.
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(url) as response:
            return response.read()
    except (OSError, ValueError):
        # URLError/HTTPError and socket errors are OSError subclasses;
        # ValueError is raised for an empty or scheme-less URL.
        return None
def upload_book():
    """Render the book file-uploader widget and return the chosen file's bytes.

    Returns:
        The content read from the uploaded pdf/txt file, or ``None`` when no
        file has been selected.
    """
    chosen = st.file_uploader("Choose a book file", type=["pdf", "txt"])
    return chosen.read() if chosen is not None else None
def _book_bytes_to_text(data):
    """Return the text of a book given as raw bytes, or None if unsupported.

    The format is detected from the content itself: a PDF is recognised by
    its ``%PDF-`` signature near the start of the data; anything else is
    treated as UTF-8 encoded plain text.
    """
    # Readers conventionally tolerate leading junk before the PDF header,
    # so search the first KiB rather than only offset 0.
    if b"%PDF-" in data[:1024]:
        # Wrap in BytesIO: the PDF reader needs a seekable stream, not bytes.
        return convert_pdf_to_txt(io.BytesIO(data))
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        return None


def _show_dataset(data):
    """Convert raw book bytes into a one-row dataset and display it."""
    with st.spinner("Converting book to dataset..."):
        text = _book_bytes_to_text(data)
        if text is None:
            st.error("Invalid file format. Please upload a book in pdf or txt format.")
            return
        dataset = pd.DataFrame({'text': [preprocess_text(text)]})
    st.write(dataset)


def main():
    """Render the Streamlit page and run the selected book-to-dataset flow.

    The sidebar selects the source. "URL" downloads the book when "Convert"
    is pressed; "Upload" shows a file uploader and processes the chosen file
    when "Upload" is pressed. In both cases the book (PDF or UTF-8 text) is
    converted to preprocessed text and shown as a single-row DataFrame.
    """
    st.set_page_config(page_title="Book to Dataset Converter", page_icon=":book:", layout="wide")
    st.title("Book to Dataset Converter")
    st.write("This app allows you to convert a book to a dataset that can be used to train AI models.")
    source = st.sidebar.radio("Select source of book", ("URL", "Upload"))

    if source == "URL":
        url = st.sidebar.text_input("Enter URL of book")
        if st.button("Convert"):
            book = None
            if url.strip():
                with st.spinner("Downloading book..."):
                    try:
                        book = download_book(url.strip())
                    except (OSError, ValueError):
                        # Network failure or malformed URL.
                        book = None
            if book is None:
                st.error("Failed to download book")
                # Stop here: there is nothing to convert.
                return
            _show_dataset(book)
    else:
        # The uploader must be rendered on every rerun; creating it only
        # inside the button handler makes it disappear (and lose the file)
        # as soon as the script reruns after the click.
        uploaded_file = upload_book()
        if st.button("Upload"):
            if uploaded_file is None:
                st.error("Failed to upload book")
                return
            _show_dataset(uploaded_file)
# Run the app only when executed as a script (Streamlit executes the file as
# ``__main__``), so importing this module has no side effects.
if __name__ == "__main__":
    main()