# oceansweep's picture
# Upload 6 files
# 41ba402 verified
# raw
# history blame
# 6.53 kB
# Media_wiki_tab.py
# Description: Gradio UI snippet that allows users to import a MediaWiki XML dump file into the application.
#
# Imports
import os
from threading import Thread
#
# 3rd-party Imports
import gradio as gr
#
# Local Imports
from App_Function_Libraries.MediaWiki.Media_Wiki import import_mediawiki_dump
#
#######################################################################################################################
#
# Create MediaWiki Import Tab
def create_mediawiki_import_tab():
    """Build the "MediaWiki Import" Gradio tab and wire up its event handlers.

    The tab collects a MediaWiki XML dump file, a wiki name, an optional
    namespace filter and chunking options, then streams the progress messages
    yielded by ``import_mediawiki_dump`` into a Markdown status panel.

    Must be called inside an active Gradio Blocks context, since it creates
    components and registers click handlers.

    Returns:
        tuple: ``(file_path, wiki_name, namespaces, skip_redirects,
        single_item, chunk_method, chunk_size, chunk_overlap, import_button,
        output)`` -- the input components, the import button and the status
        Markdown component, in that order.
    """
    # Cooperative cancellation flag: set by cancel_import, polled by run_import.
    # NOTE(review): this lives in the closure, so it is shared by every client
    # using this tab instance -- confirm that is acceptable for the deployment.
    cancel_flag = False

    def parse_namespaces(namespaces):
        """Turn the comma-separated namespace text into a list of ints.

        Returns None when the text is empty or only whitespace (meaning "all
        namespaces"). Raises ValueError if any entry is not an integer.
        """
        if not namespaces or not namespaces.strip():
            return None
        return [int(ns.strip()) for ns in namespaces.split(',')]

    def validate_inputs(file_path, wiki_name, namespaces):
        """Return a user-facing error message, or None when the inputs are usable."""
        if not file_path:
            return "Please select a MediaWiki XML dump file."
        if not wiki_name or not wiki_name.strip():
            return "Please enter a name for the wiki."
        try:
            parse_namespaces(namespaces)
        except ValueError:
            return "Invalid namespaces. Please enter comma-separated integers."
        return None

    def check_file_size(file_path):
        """Return a warning string when the file exceeds 1000 MB, else None.

        An unreadable path yields None here; the import call itself will then
        surface the real error through the status panel.
        """
        max_size_mb = 1000  # roughly 1 GB
        try:
            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        except OSError:
            return None
        if file_size_mb > max_size_mb:
            return f"Warning: The selected file is {file_size_mb:.2f} MB. Importing large files may take a long time."
        return None

    def run_import(file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size,
                   chunk_overlap, progress=gr.Progress()):
        """Generator click handler: run the import and stream status updates.

        Every yield is a 3-tuple matching the click outputs
        ``[cancel_button, import_button, output]``.
        """
        nonlocal cancel_flag

        validation_error = validate_inputs(file_path, wiki_name, namespaces)
        if validation_error:
            # This function is a generator, so the message has to be yielded;
            # a value passed to ``return`` would be discarded and never shown.
            yield gr.update(), gr.update(), validation_error
            return

        # Clear any cancellation request left over from a previous run.
        cancel_flag = False

        # The upload may arrive as a tempfile-like object exposing ``.name`` or
        # as a plain path string (presumably depends on the Gradio version /
        # File ``type`` setting -- TODO confirm); accept both.
        dump_path = getattr(file_path, "name", file_path)

        file_size_warning = check_file_size(dump_path)
        status_text = "# MediaWiki Import Process\n\n## Initializing\n- Starting import process...\n"
        if file_size_warning:
            status_text += f"- {file_size_warning}\n"

        chunk_options = {
            'method': chunk_method,
            'max_size': chunk_size,
            'overlap': chunk_overlap,
            'adaptive': True,
            'language': 'en'
        }
        namespaces_list = parse_namespaces(namespaces)

        # Swap the buttons so the user can actually reach "Cancel Import"
        # while the import is running.
        yield gr.update(visible=True), gr.update(visible=False), status_text

        pages_processed = 0
        try:
            for progress_info in import_mediawiki_dump(
                    file_path=dump_path,
                    wiki_name=wiki_name.strip(),
                    namespaces=namespaces_list,
                    skip_redirects=skip_redirects,
                    chunk_options=chunk_options,
                    single_item=single_item,
                    progress_callback=progress
            ):
                if cancel_flag:
                    status_text += "\n## Cancelled\n- Import cancelled by user.\n"
                    break

                if progress_info.startswith("Found"):
                    status_text += f"\n## Parsing\n- {progress_info}\n"
                elif progress_info.startswith("Processed page"):
                    pages_processed += 1
                    if pages_processed % 10 == 0:  # Update every 10 pages to avoid too frequent updates
                        status_text += f"- {progress_info}\n"
                elif progress_info.startswith("Successfully imported"):
                    status_text += f"\n## Completed\n- {progress_info}\n- Total pages processed: {pages_processed}"
                else:
                    status_text += f"- {progress_info}\n"

                yield gr.update(), gr.update(), status_text
            else:
                # Reached only when the loop was not interrupted by a cancel.
                status_text += "\n## Import Process Completed Successfully"
        except Exception as e:
            # UI boundary: report the failure in the status panel rather than
            # letting the handler crash.
            status_text += f"\n## Error\n- An error occurred during the import process: {str(e)}"

        # Restore the idle button state: hide Cancel, show Import.
        yield gr.update(visible=False), gr.update(visible=True), status_text

    def cancel_import():
        """Request cancellation; run_import stops at its next progress message."""
        nonlocal cancel_flag
        cancel_flag = True
        return gr.update(visible=False), gr.update(visible=True)

    with gr.Tab("MediaWiki Import"):
        gr.Markdown("# Import MediaWiki Dump")
        with gr.Row():
            with gr.Column():
                file_path = gr.File(label="MediaWiki XML Dump File")
                wiki_name = gr.Textbox(label="Wiki Name", placeholder="Enter a unique name for this wiki")
                namespaces = gr.Textbox(label="Namespaces (comma-separated integers, leave empty for all)")
                skip_redirects = gr.Checkbox(label="Skip Redirects", value=True)
                single_item = gr.Checkbox(label="Import as Single Item", value=False)
                chunk_method = gr.Dropdown(
                    choices=["sentences", "words", "paragraphs", "tokens"],
                    value="sentences",
                    label="Chunking Method"
                )
                chunk_size = gr.Slider(minimum=100, maximum=2000, value=1000, step=100, label="Chunk Size")
                chunk_overlap = gr.Slider(minimum=0, maximum=500, value=100, step=10, label="Chunk Overlap")
                import_button = gr.Button("Import MediaWiki Dump")
                cancel_button = gr.Button("Cancel Import", visible=False)
            with gr.Column():
                output = gr.Markdown(label="Import Status")

        import_button.click(
            run_import,
            inputs=[file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size,
                    chunk_overlap],
            outputs=[cancel_button, import_button, output]
        )

        cancel_button.click(
            cancel_import,
            outputs=[cancel_button, import_button],
            # Bypass the request queue so the cancel is not stuck waiting
            # behind the long-running import it is meant to interrupt.
            queue=False
        )

    return file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size, chunk_overlap, import_button, output
#
# End of MediaWiki Import Tab
#######################################################################################################################