Fetch-Content / app.py
KingNish's picture
Update app.py
413592b verified
raw
history blame
7.53 kB
import PyPDF2
from openpyxl import load_workbook
from pptx import Presentation
import gradio as gr
import io
import re
import zipfile
import xml.etree.ElementTree as ET
import filetype
import requests
import os
import mimetypes
# Constants
CHUNK_SIZE = 32000  # Maximum characters per chunk produced by split_content.
# --- Utility Functions ---
def xml2text(xml):
    """Extract the concatenated element text of an XML document.

    Args:
        xml: XML document as a string or bytes.

    Returns:
        The ``.text`` of every element (depth-first order), each followed
        by a single space. Elements with no text contribute nothing; tail
        text is not included.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
    """
    root = ET.fromstring(xml)
    # ''.join is O(n) overall; the original += in a loop was quadratic.
    return ''.join(
        child.text + " " for child in root.iter() if child.text is not None
    )
def clean_text(content):
    """Collapse every run of whitespace in *content* into a single space.

    Newlines, carriage returns, and tabs are first turned into spaces,
    then consecutive whitespace is squashed to one space.
    """
    for ws in ('\n', '\r', '\t'):
        content = content.replace(ws, ' ')
    return re.sub(r'\s+', ' ', content)
def split_content(content, chunk_size=CHUNK_SIZE):
    """Split *content* into consecutive chunks of at most *chunk_size* chars.

    Returns a list of slices covering the whole input in order; the last
    chunk may be shorter. An empty input yields an empty list.
    """
    return [
        content[offset:offset + chunk_size]
        for offset in range(0, len(content), chunk_size)
    ]
# --- Document Reading Functions ---
def extract_text_from_docx(docx_data, clean=True):
    """Extract text from a DOCX file's headers, body, and footers.

    Args:
        docx_data: Raw bytes of the .docx file (a ZIP archive).
        clean: If True, collapse whitespace in the result via clean_text.

    Returns:
        Tuple of (extracted text, character count of that text).

    Raises:
        zipfile.BadZipFile: If docx_data is not a valid ZIP archive.
        KeyError: If the archive lacks word/document.xml.
    """
    text = u''
    # Context manager guarantees the archive handle is released even when
    # reading or XML parsing raises (the original leaked it on exceptions).
    with zipfile.ZipFile(io.BytesIO(docx_data)) as zipf:
        filelist = zipf.namelist()
        # Reading order: headers, then the main body, then footers.
        for fname in filelist:
            if re.match('word/header[0-9]*.xml', fname):
                text += xml2text(zipf.read(fname))
        text += xml2text(zipf.read('word/document.xml'))
        for fname in filelist:
            if re.match('word/footer[0-9]*.xml', fname):
                text += xml2text(zipf.read(fname))
    if clean:
        text = clean_text(text)
    return text, len(text)
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract text from a PPTX file's notes and slide content.

    Args:
        pptx_data: Raw bytes of the .pptx file (a ZIP archive).
        clean: If True, collapse whitespace in the result via clean_text.

    Returns:
        Tuple of (extracted text, character count of that text).

    Raises:
        zipfile.BadZipFile: If pptx_data is not a valid ZIP archive.
    """
    text = u''
    # Context manager guarantees the archive handle is released even when
    # reading or XML parsing raises (the original leaked it on exceptions).
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
        filelist = zipf.namelist()
        # Speaker notes first, then the slides themselves (shapes/text boxes).
        for fname in filelist:
            if re.match('ppt/notesSlides/notesSlide[0-9]*.xml', fname):
                text += xml2text(zipf.read(fname))
        for fname in filelist:
            if re.match('ppt/slides/slide[0-9]*.xml', fname):
                text += xml2text(zipf.read(fname))
    if clean:
        text = clean_text(text)
    return text, len(text)
def _extract_pdf_text(data):
    """Concatenate the extracted text of every page of a PDF (bytes in)."""
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(data))
    return ''.join(page.extract_text() for page in pdf_reader.pages)


def _extract_xlsx_text(data):
    """Concatenate every non-empty cell value across all worksheets (bytes in)."""
    wb = load_workbook(io.BytesIO(data))
    return ''.join(
        str(cell.value) + ' '
        for sheet in wb.worksheets
        for row in sheet.rows
        for cell in row
        if cell.value is not None
    )


def read_document(file_path, clean=True):
    """Read a local file and extract its text content.

    PDF, XLSX, DOCX, and PPTX are handled by type-specific extractors;
    everything else (including text/plain and text/csv) is decoded as UTF-8.

    Args:
        file_path: Path to the file on disk.
        clean: If True, collapse whitespace in the extracted text.

    Returns:
        Tuple of (content, character count). On failure the first element
        is an error message and the count is 0.
    """
    with open(file_path, "rb") as f:
        file_content = f.read()

    # filetype sniffs magic bytes; None means no known binary signature,
    # so we fall through to plain-text handling.
    kind = filetype.guess(file_content)
    mime = kind.mime if kind is not None else "text"

    if mime == "application/pdf":
        try:
            content = _extract_pdf_text(file_content)
        except Exception as e:
            return f"Error reading PDF: {e}", 0
    elif mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        try:
            content = _extract_xlsx_text(file_content)
        except Exception as e:
            return f"Error reading XLSX: {e}", 0
    elif mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        try:
            # DOCX/PPTX extractors apply `clean` themselves and return
            # (text, length) directly.
            return extract_text_from_docx(file_content, clean)
        except Exception as e:
            return f"Error reading DOCX: {e}", 0
    elif mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        try:
            return extract_text_from_pptx(file_content, clean)
        except Exception as e:
            return f"Error reading PPTX: {e}", 0
    else:
        # text/plain, text/csv, and unrecognized types all decode as UTF-8;
        # only the error label differs, matching the original messages.
        label = {"text/plain": "TXT file", "text/csv": "CSV file"}.get(mime, "file")
        try:
            content = file_content.decode('utf-8')
        except Exception as e:
            return f"Error reading {label}: {e}", 0

    if clean:
        content = clean_text(content)
    # len(content) is the true character count; the original len(repr(content))
    # over-counted by the quotes and escape sequences repr adds, and was
    # inconsistent with the DOCX/PPTX paths which already return len(text).
    return content, len(content)
def download_and_process_file(url, clean=True):
    """Download a file from a URL and extract its content.

    Args:
        url: The URL to fetch; "http://" is prepended when no scheme is given.
        clean: If True, collapse whitespace in extracted document text.

    Returns:
        Tuple of (result, length). The result is markdown image syntax for
        images, extracted text for documents, or an error message (length 0).
    """
    if not url.startswith("http://") and not url.startswith("https://"):
        url = "http://" + url  # Prepend "http://" if not present

    try:
        # A timeout keeps a stalled server from hanging the worker forever
        # (the original request had none).
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Derive a filesystem-safe local name from the last URL segment.
        original_filename = os.path.basename(url)
        temp_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)

        # Append an extension inferred from the Content-Type header, if any.
        # .get() avoids an unhandled KeyError when the server omits the header.
        content_type = response.headers.get('content-type', '')
        ext = mimetypes.guess_extension(content_type)
        if ext and not temp_filename.endswith(ext):
            temp_filename += ext

        with open(temp_filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192000):
                f.write(chunk)

        # NOTE(review): the temp file is never deleted afterwards — it piles
        # up in the working directory; confirm whether cleanup is wanted.
        kind = filetype.guess(temp_filename)
        if kind and kind.mime.startswith('image/'):
            return f"![]({url})", 0  # Return markdown image syntax if it's an image
        else:
            return read_document(temp_filename, clean)  # Otherwise, process as a document
    except requests.exceptions.MissingSchema:
        return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the server. Please check your internet connection.", 0
    except requests.exceptions.RequestException as e:
        return f"Error downloading file: {e}", 0
# --- Gradio Interface ---
# Wires download_and_process_file to a simple URL-in / text-out UI.
iface = gr.Interface(
    fn=download_and_process_file,
    inputs=[
        gr.Textbox(lines=1, placeholder="Enter URL of the file"),
        gr.Checkbox(label="Clean Text", value=True),
    ],
    outputs=[
        gr.Textbox(label="Document Content/Image Markdown"),
        gr.Number(label="Document Length (characters)"),
    ],
    title="Enhanced File Processor for Hugging Face Chat Tools",
    # The original implicit string concatenation ran the sentences together
    # ("...its contentThis tool..."); the added ". " fixes the rendered text.
    description="Enter the URL of a site and extract its content. "
    "This tool is designed for use with Hugging Face Chat Tools: "
    "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
    concurrency_limit=None,
)

iface.launch()