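# NOTE(review): the original first three lines read "Spaces:" / "Sleeping" /
# "Sleeping". They look like web-page status text captured along with the code,
# not part of the program; kept here as a comment so the module can parse.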
import io
import mimetypes
import os
import re
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from urllib.parse import urlparse

import filetype
import gradio as gr
import PyPDF2
import requests
from openpyxl import load_workbook
from pptx import Presentation
# Constants
# Default maximum number of characters per chunk used by split_content().
CHUNK_SIZE = 32000
# --- Utility Functions ---
def xml2text(xml):
    """Return the text of every element in an XML document, space-separated.

    Args:
        xml: The XML document as ``str`` or ``bytes``.

    Returns:
        The ``.text`` of each element that has one, in document order, each
        followed by a single space. Elements without text contribute nothing;
        tail text (text following a closing tag) is not included.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed.
    """
    # NOTE(review): the standard-library parser is not hardened against
    # maliciously constructed XML; callers in this file feed it data from
    # downloaded documents. Consider a hardened parser if that is a concern.
    root = ET.fromstring(xml)
    # Join once instead of growing a string with += for every element.
    return "".join(
        child.text + " " for child in root.iter() if child.text is not None
    )
def clean_text(content):
    """Collapse every run of whitespace in ``content`` into a single space.

    Newlines, carriage returns and tabs are whitespace, so they are folded
    into the same single space. Leading and trailing whitespace is collapsed
    but not stripped.

    Args:
        content: The text to normalise.

    Returns:
        The normalised text.
    """
    # r'\s+' already matches '\n', '\r' and '\t', so no separate replace
    # passes are needed before it.
    return re.sub(r'\s+', ' ', content)
def split_content(content, chunk_size=CHUNK_SIZE):
    """Split ``content`` into consecutive chunks of at most ``chunk_size`` items.

    Args:
        content: A sliceable sequence (typically a string).
        chunk_size: Maximum length of each chunk; must be positive.

    Returns:
        A list of slices of ``content`` in order. The last chunk may be
        shorter than ``chunk_size``; empty ``content`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A zero step makes range() raise an unhelpful error and a negative step
    # would silently produce no chunks at all, dropping the content.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [
        content[start:start + chunk_size]
        for start in range(0, len(content), chunk_size)
    ]
# --- Document Reading Functions ---
def extract_text_from_docx(docx_data, clean=True):
    """Extract text from a DOCX file supplied as raw bytes.

    The archive's header parts are read first, then the main document body
    (``word/document.xml``), then the footer parts. Header and footer parts
    are taken in the order the archive lists them.

    Args:
        docx_data: The bytes of the ``.docx`` (zip) file.
        clean: When true, pass the result through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``docx_data`` is not a zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # fullmatch with an escaped dot, so only names of exactly this shape are
    # picked up (the unanchored form also accepted trailing junk).
    header_re = re.compile(r'word/header[0-9]*\.xml')
    footer_re = re.compile(r'word/footer[0-9]*\.xml')

    parts = []
    # The context manager closes the archive even if a member fails to parse.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as zipf:
        filelist = zipf.namelist()
        parts.extend(
            xml2text(zipf.read(fname))
            for fname in filelist
            if header_re.fullmatch(fname)
        )
        parts.append(xml2text(zipf.read('word/document.xml')))
        parts.extend(
            xml2text(zipf.read(fname))
            for fname in filelist
            if footer_re.fullmatch(fname)
        )

    text = "".join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract text from a PPTX file supplied as raw bytes.

    Text from the slide-notes parts is emitted first, followed by text from
    the slide parts. Within each group, parts are taken in the order the
    archive lists them.

    Args:
        pptx_data: The bytes of the ``.pptx`` (zip) file.
        clean: When true, pass the result through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``pptx_data`` is not a zip archive.
    """
    # fullmatch with an escaped dot, so only names of exactly this shape are
    # picked up (the unanchored form also accepted trailing junk).
    notes_re = re.compile(r'ppt/notesSlides/notesSlide[0-9]*\.xml')
    slide_re = re.compile(r'ppt/slides/slide[0-9]*\.xml')

    parts = []
    # The context manager closes the archive even if a member fails to parse.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
        filelist = zipf.namelist()
        # Slide notes
        parts.extend(
            xml2text(zipf.read(fname))
            for fname in filelist
            if notes_re.fullmatch(fname)
        )
        # Slide content (shapes and text boxes)
        parts.extend(
            xml2text(zipf.read(fname))
            for fname in filelist
            if slide_re.fullmatch(fname)
        )

    text = "".join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)
def _read_pdf_text(data):
    """Return the concatenated text of every page of a PDF given as bytes."""
    reader = PyPDF2.PdfReader(io.BytesIO(data))
    # Guard with `or ''` so a page that yields no text cannot break the join.
    return "".join(page.extract_text() or "" for page in reader.pages)


def _read_xlsx_text(data):
    """Return every non-empty cell value of an XLSX workbook, space-separated."""
    workbook = load_workbook(io.BytesIO(data))
    return "".join(
        str(cell.value) + " "
        for sheet in workbook.worksheets
        for row in sheet.rows
        for cell in row
        if cell.value is not None
    )


def read_document(file_path, clean=True):
    """Read a file from disk and return its text content.

    The file type is detected with ``filetype.guess`` on the file's bytes.
    PDF, XLSX, DOCX and PPTX get dedicated extraction; anything else
    (including undetected types) is decoded as UTF-8 text.

    Args:
        file_path: Path of the file to read.
        clean: When true, collapse whitespace in the result via ``clean_text``.

    Returns:
        A ``(content, length)`` tuple where ``length`` is the number of
        characters in ``content``. If extraction fails, ``content`` is an
        error message and ``length`` is 0.

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        file_content = f.read()

    kind = filetype.guess(file_content)
    mime = "text" if kind is None else kind.mime

    if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        try:
            return extract_text_from_docx(file_content, clean)
        except Exception as e:
            return f"Error reading DOCX: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        try:
            return extract_text_from_pptx(file_content, clean)
        except Exception as e:
            return f"Error reading PPTX: {e}", 0

    if mime == "application/pdf":
        # Broad catch on purpose: the parser's failure modes on damaged files
        # are varied and every one of them is reported the same way.
        try:
            content = _read_pdf_text(file_content)
        except Exception as e:
            return f"Error reading PDF: {e}", 0
    elif mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        try:
            content = _read_xlsx_text(file_content)
        except Exception as e:
            return f"Error reading XLSX: {e}", 0
    else:
        # Plain text, CSV and every unrecognised type share one UTF-8 decode
        # path; only the wording of the error message differs.
        label = {"text/plain": "TXT file", "text/csv": "CSV file"}.get(mime, "file")
        try:
            content = file_content.decode('utf-8')
        except UnicodeDecodeError as e:
            return f"Error reading {label}: {e}", 0

    if clean:
        content = clean_text(content)
    # Report the real character count. len(repr(content)) also counted the
    # surrounding quotes and escape sequences, and disagreed with the
    # DOCX/PPTX paths, which return len(text).
    return content, len(content)
# Seconds to wait on the server before giving up, so that an unresponsive
# host cannot block a request indefinitely.
_REQUEST_TIMEOUT = 30


def download_and_process_file(url, clean=True):
    """Download the file at ``url`` and return its extracted content.

    Args:
        url: Address of the file. ``http://`` is prepended when the value
            does not already start with ``http://`` or ``https://``.
        clean: When true, collapse whitespace in extracted text.

    Returns:
        A ``(content, length)`` tuple. For an image, ``content`` is a markdown
        image tag pointing at ``url`` and ``length`` is 0. For any other file
        it is whatever ``read_document`` returns. On a download or file-system
        failure, ``content`` is an error message and ``length`` is 0.
    """
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        url = "http://" + url  # Prepend "http://" if not present

    try:
        # NOTE(review): `url` is user-supplied and fetched from the server
        # running this app; restrict reachable hosts if that matters for the
        # deployment.
        with requests.get(url, stream=True, timeout=_REQUEST_TIMEOUT) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            # Build the file name from the URL *path* only, so query strings
            # and fragments do not leak into it, then replace unsafe characters.
            original_filename = os.path.basename(urlparse(url).path)
            safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
            # URLs ending in "/" give an empty name; "." and ".." are not
            # usable file names either.
            temp_filename = safe_filename.strip(". ") or "download"

            # Infer the extension from the media type alone: the header may be
            # missing, and a "; charset=..." suffix would defeat the lookup.
            content_type = response.headers.get('content-type', '')
            ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
            if ext and not temp_filename.endswith(ext):
                temp_filename += ext

            # A private temporary directory keeps concurrent requests from
            # overwriting each other's files and removes the download when
            # processing is finished.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = os.path.join(temp_dir, temp_filename)
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192000):
                        f.write(chunk)

                kind = filetype.guess(temp_path)
                if kind and kind.mime.startswith('image/'):
                    # Return markdown image syntax if it's an image.
                    return f"![]({url})", 0
                # Otherwise, process as a document.
                return read_document(temp_path, clean)
    except requests.exceptions.MissingSchema:
        return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the server. Please check your internet connection.", 0
    except requests.exceptions.RequestException as e:
        return f"Error downloading file: {e}", 0
    except OSError as e:
        # File-system failures (e.g. an unusable file name) are reported in
        # the same tuple form as download errors instead of propagating.
        return f"Error processing file: {e}", 0
# --- Gradio Interface ---
# Web UI: takes a URL and a "clean" flag, shows the extracted content (or the
# image markdown) together with its length.
iface = gr.Interface(
    fn=download_and_process_file,
    inputs=[
        gr.Textbox(lines=1, placeholder="Enter URL of the file"),
        gr.Checkbox(label="Clean Text", value=True),
    ],
    outputs=[
        gr.Textbox(label="Document Content/Image Markdown"),
        gr.Number(label="Document Length (characters)"),
    ],
    title="Enhanced File Processor for Hugging Face Chat Tools",
    # Adjacent string literals are concatenated as-is, so the first sentence
    # needs its own terminator and trailing space.
    description=(
        "Enter the URL of site and extract its content. "
        "This tool is designed for use with Hugging Face Chat Tools: "
        "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)"
    ),
    # NOTE(review): None presumably means no cap on concurrent calls of `fn`;
    # confirm against the installed Gradio version.
    concurrency_limit=None,
)
iface.launch()