# Spaces: Build error
import io
import re
import urllib.request

import pandas as pd
import PyPDF2
import streamlit as st
def convert_pdf_to_txt(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: The PDF either as raw ``bytes``/``bytearray`` or as
            anything PyPDF2 itself accepts (a binary file-like object or
            a path).

    Returns:
        The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    # The callers in this file hand over raw bytes, but PyPDF2 needs a
    # seekable stream, so wrap in-memory data before parsing.
    if isinstance(pdf_file, (bytes, bytearray)):
        pdf_file = io.BytesIO(pdf_file)

    # PyPDF2 1.28 introduced PdfReader / .pages / .extract_text(), and
    # PyPDF2 3.0 removed the old camelCase API. Prefer the new API and fall
    # back to the legacy one only on old installations.
    if hasattr(PyPDF2, "PdfReader"):
        reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() or "" for page in reader.pages)

    reader = PyPDF2.PdfFileReader(pdf_file)
    return "".join(
        reader.getPage(index).extractText() for index in range(reader.numPages)
    )
def preprocess_text(text):
    """Normalise raw book text for use as dataset content.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is removed, and the result is
    lower-cased.

    Args:
        text: The text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
def download_book(url):
    """Fetch the resource at *url* and return its raw bytes.

    Args:
        url: The address of the book to download.

    Returns:
        The response body as ``bytes``, or ``None`` when the URL is
        malformed or the download fails. The caller in this file tests the
        result against ``None`` to report a failed download.
    """
    # NOTE(security): urlopen also accepts non-HTTP schemes such as
    # ``file:``. If *url* comes from an untrusted user, consider
    # restricting it to http/https before calling this function.
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(url) as response:
            return response.read()
    except (OSError, ValueError):
        # URLError/HTTPError and socket errors are OSError subclasses;
        # ValueError is raised for an empty or scheme-less URL.
        return None
def upload_book():
    """Render the file-upload widget and return the uploaded file's bytes.

    Returns:
        The full contents of the uploaded file as ``bytes``, or ``None``
        when no file has been uploaded yet.
    """
    uploaded_file = st.file_uploader("Choose a book file", type=["pdf", "txt"])
    if uploaded_file is None:
        return None
    # getvalue() returns the whole buffer regardless of the current stream
    # position, whereas read() returns b"" once the stream has been consumed
    # (e.g. if the same uploaded-file object is seen again on a rerun).
    return uploaded_file.getvalue()
def _book_bytes_to_text(data):
    """Return the text of a book given as raw bytes, or None if unsupported.

    The format is detected from the content itself: data carrying the
    ``%PDF-`` signature near its start is parsed as a PDF, anything else is
    decoded as UTF-8 plain text. ``None`` means it was neither.
    """
    if b"%PDF-" in data[:1024]:
        # Wrap in a stream because PDF parsing needs a seekable file object.
        return convert_pdf_to_txt(io.BytesIO(data))
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        return None


def _render_dataset(data):
    """Convert raw book bytes into a one-row dataset and display it."""
    with st.spinner("Converting book to dataset..."):
        try:
            text = _book_bytes_to_text(data)
        except Exception as exc:
            # UI boundary: a corrupt PDF should produce a message in the
            # page, not an unhandled traceback.
            st.error(f"Failed to convert book: {exc}")
            return
        if text is None:
            st.error("Invalid file format. Please upload a book in pdf or txt format.")
            return
        dataset = pd.DataFrame({'text': [preprocess_text(text)]})
    st.write(dataset)


def main():
    """Run the Streamlit "Book to Dataset Converter" page.

    The user picks a source in the sidebar (a URL or an uploaded file). When
    the matching button is pressed, the book is fetched, converted to
    cleaned text and shown as a single-row DataFrame with a ``text`` column.
    """
    st.set_page_config(page_title="Book to Dataset Converter", page_icon=":book:", layout="wide")
    st.title("Book to Dataset Converter")
    st.write("This app allows you to convert a book to a dataset that can be used to train AI models.")
    source = st.sidebar.radio("Select source of book", ("URL", "Upload"))

    if source == "URL":
        url = st.sidebar.text_input("Enter URL of book")
        if not st.button("Convert"):
            return
        with st.spinner("Downloading book..."):
            try:
                book = download_book(url)
            except (OSError, ValueError):
                # Network failures (URLError is an OSError) or a malformed URL.
                book = None
        if book is None:
            st.error("Failed to download book")
            # Stop here; there is nothing to convert.
            return
        _render_dataset(book)
        return

    # The uploader must be rendered on every rerun. A button is only True for
    # the single rerun after the click, so creating the widget inside the
    # button branch would make it disappear before a file could be chosen.
    book = upload_book()
    if not st.button("Upload"):
        return
    if book is None:
        st.error("Failed to upload book")
        return
    _render_dataset(book)
# Start the app only when this file is executed as a script (Streamlit runs
# the script as ``__main__``), so importing the module has no side effects.
if __name__ == "__main__":
    main()