# Media_Wiki.py
# Description: Functions to import MediaWiki dumps into the media_db and Chroma databases.
#######################################################################################################################
#
# Imports
import json
import logging
import os
import re
from typing import Any, Dict, Iterator, List, Optional
#
# 3rd-Party Imports
import mwparserfromhell
import mwxml
import yaml
#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords, check_media_exists
from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
#
#######################################################################################################################
#
# Functions:

def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
    """Set up and return a logger with the given name and level.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        level: Logging level applied to the logger.
        log_file: Optional path; when given, a ``FileHandler`` is attached
            in addition to the console handler.

    Returns:
        The configured logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)
    # Only attach handlers once per logger name — the previous version
    # re-added handlers on every call, duplicating each log line.
    if not logger.handlers:
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        if log_file:
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    return logger

# Usage
logger = setup_logger('mediawiki_import', log_file='mediawiki_import.log')

# Load configuration
def load_mediawiki_import_config() -> Dict[str, Any]:
    """Load and return the MediaWiki import settings from the YAML config file."""
    with open(os.path.join('Config_Files', 'mediawiki_import_config.yaml'), 'r') as f:
        return yaml.safe_load(f)

config = load_mediawiki_import_config()

def parse_mediawiki_dump(file_path: str, namespaces: Optional[List[int]] = None,
                         skip_redirects: bool = False) -> Iterator[Dict[str, Any]]:
    """Stream pages out of a MediaWiki XML dump as plain-text dicts.

    Args:
        file_path: Path to the MediaWiki XML dump file.
        namespaces: Namespace IDs to include; ``None`` includes all.
        skip_redirects: When True, redirect pages are skipped.

    Yields:
        One dict per revision with keys: title, content (wikitext stripped
        to plain text), namespace, page_id, revision_id, timestamp.
    """
    # Hold the file open for the generator's lifetime; the previous version
    # opened it without ever closing it (resource leak).
    with open(file_path, encoding='utf-8') as dump_file:
        dump = mwxml.Dump.from_file(dump_file)
        for page in dump.pages:
            if skip_redirects and page.redirect:
                continue
            if namespaces and page.namespace not in namespaces:
                continue
            for revision in page:
                code = mwparserfromhell.parse(revision.text)
                text = code.strip_code(normalize=True, collapse=True, keep_template_params=False)
                yield {
                    "title": page.title,
                    "content": text,
                    "namespace": page.namespace,
                    "page_id": page.id,
                    "revision_id": revision.id,
                    "timestamp": revision.timestamp
                }
                logger.debug(f"Yielded page: {page.title}")

def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split wiki text into chunks along ``== Section ==`` boundaries.

    Sections are accumulated into a chunk until adding the next section would
    exceed ``chunk_options['max_size']``, at which point the chunk is flushed.

    Args:
        text: Plain text with MediaWiki level-2 section headers.
        chunk_options: Must contain ``max_size`` (int, max chunk length in chars).

    Returns:
        List of ``{"text": ..., "metadata": {"section": ...}}`` dicts, where
        ``section`` is the title of the first section in the chunk.
    """
    # re.split with a capturing group returns
    # [preamble, title1, body1, title2, body2, ...].
    # The previous version walked even indices, pairing body text with the
    # NEXT section's title — an off-by-one that scrambled titles and bodies.
    parts = re.split(r'\n==\s*(.*?)\s*==\n', text)
    max_size = chunk_options['max_size']

    # Pair titles with their bodies; any preamble before the first header
    # becomes an implicit "Introduction" section.
    sections: List[tuple] = []
    if parts[0].strip():
        sections.append(("Introduction", parts[0]))
    sections.extend(zip(parts[1::2], parts[2::2]))

    chunks: List[Dict[str, Any]] = []
    current_chunk = ""
    current_size = 0
    current_section = "Introduction"

    for title, body in sections:
        rendered = body if title == "Introduction" else f"\n== {title} ==\n" + body
        if current_chunk and current_size + len(body) > max_size:
            chunks.append({"text": current_chunk, "metadata": {"section": current_section}})
            current_chunk = rendered
            current_size = len(body)
            current_section = title
        else:
            if not current_chunk:
                current_section = title
            current_chunk += rendered
            current_size += len(body)

    if current_chunk:
        chunks.append({"text": current_chunk, "metadata": {"section": current_section}})
    return chunks

def process_single_item(content: str, title: str, wiki_name: str, chunk_options: Dict[str, Any],
                        is_combined: bool = False, item: Optional[Dict[str, Any]] = None) -> None:
    """Ingest one article (or a combined dump) into media_db and Chroma.

    Skips items whose (title, url) pair already exists. Errors are logged and
    swallowed so one bad page cannot abort a whole dump import.

    Args:
        content: Plain-text article content.
        title: Page title.
        wiki_name: Name used to namespace the synthetic ``mediawiki:`` URL.
        chunk_options: Passed through to :func:`optimized_chunking`.
        is_combined: True when ``content`` is an entire dump, not one article.
        item: Optional page dict from :func:`parse_mediawiki_dump`; supplies
            the ingestion date when present.
    """
    try:
        url = f"mediawiki:{wiki_name}" if is_combined else f"mediawiki:{wiki_name}:{title}"
        if check_media_exists(title, url):
            logger.info(f"Skipping existing article: {title}")
            return
        media_id = add_media_with_keywords(
            url=url,
            title=title,
            media_type="mediawiki_dump" if is_combined else "mediawiki_article",
            content=content,
            keywords=f"mediawiki,{wiki_name}" + (",full_dump" if is_combined else ",article"),
            prompt="",
            summary="",
            transcription_model="",
            author="MediaWiki",
            # NOTE(review): assumes item['timestamp'] exposes strftime (mwxml
            # Timestamp does) — confirm against parse_mediawiki_dump output.
            ingestion_date=item['timestamp'].strftime('%Y-%m-%d') if item else None
        )
        for chunk in optimized_chunking(content, chunk_options):
            process_and_store_content(chunk['text'], f"mediawiki_{wiki_name}", media_id, title)
        logger.info(f"Successfully processed item: {title}")
    except Exception as e:
        logger.error(f"Error processing item {title}: {str(e)}")

def load_checkpoint(file_path: str) -> int:
    """Return the last processed page id from a checkpoint file, or 0 if absent."""
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            return json.load(f)['last_processed_id']
    return 0

def save_checkpoint(file_path: str, last_processed_id: int) -> None:
    """Persist the id of the last successfully processed page."""
    with open(file_path, 'w') as f:
        json.dump({'last_processed_id': last_processed_id}, f)

def import_mediawiki_dump(
        file_path: str,
        wiki_name: str,
        namespaces: Optional[List[int]] = None,
        skip_redirects: bool = False,
        chunk_options: Optional[Dict[str, Any]] = None,
        single_item: bool = False,
        progress_callback: Any = None
) -> Iterator[str]:
    """Import a MediaWiki dump, yielding human-readable progress messages.

    Resumes from a per-wiki checkpoint file so an interrupted import can be
    re-run without reprocessing pages. The checkpoint is removed on success.

    Args:
        file_path: Path to the MediaWiki XML dump file.
        wiki_name: Name used for the checkpoint file and media URLs.
        namespaces: Namespace IDs to include; ``None`` includes all.
        skip_redirects: When True, redirect pages are skipped.
        chunk_options: Chunking settings; defaults to ``config['chunking']``.
        single_item: Accepted for interface compatibility; not used here.
        progress_callback: Optional ``callable(fraction, message)``.

    Yields:
        Progress / error strings, one per processed page plus summary lines.
    """
    try:
        if chunk_options is None:
            chunk_options = config['chunking']

        checkpoint_file = f"{wiki_name}_import_checkpoint.json"
        last_processed_id = load_checkpoint(checkpoint_file)

        total_pages = count_pages(file_path, namespaces, skip_redirects)
        processed_pages = 0

        yield f"Found {total_pages} pages to process."

        for item in parse_mediawiki_dump(file_path, namespaces, skip_redirects):
            if item['page_id'] <= last_processed_id:
                continue

            process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item)
            save_checkpoint(checkpoint_file, item['page_id'])
            processed_pages += 1
            if progress_callback is not None:
                # Guard the division: count_pages returns 0 on error or an
                # empty dump, which previously raised ZeroDivisionError here.
                fraction = processed_pages / total_pages if total_pages else 1.0
                progress_callback(fraction, f"Processed page: {item['title']}")
            yield f"Processed page {processed_pages}/{total_pages}: {item['title']}"

        # Remove checkpoint file after successful import. It only exists if at
        # least one page was processed; an unconditional remove raised
        # FileNotFoundError on empty imports and was then mis-reported as a
        # missing dump file by the handler below.
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
        yield f"Successfully imported and indexed MediaWiki dump: {wiki_name}"
    except FileNotFoundError:
        logger.error(f"MediaWiki dump file not found: {file_path}")
        yield f"Error: File not found - {file_path}"
    except PermissionError:
        logger.error(f"Permission denied when trying to read: {file_path}")
        yield f"Error: Permission denied - {file_path}"
    except Exception as e:
        logger.exception(f"Error during MediaWiki import: {str(e)}")
        yield f"Error during import: {str(e)}"

def count_pages(file_path: str, namespaces: Optional[List[int]] = None,
                skip_redirects: bool = False) -> int:
    """
    Count the number of pages in a MediaWiki XML dump file.

    Args:
        file_path (str): Path to the MediaWiki XML dump file.
        namespaces (List[int], optional): List of namespace IDs to include.
            If None, include all namespaces.
        skip_redirects (bool, optional): Whether to skip redirect pages.

    Returns:
        int: The number of matching pages, or 0 on any error.
    """
    try:
        # Close the dump file when done; the previous version leaked it.
        with open(file_path, encoding='utf-8') as dump_file:
            dump = mwxml.Dump.from_file(dump_file)
            count = 0
            for page in dump.pages:
                if skip_redirects and page.redirect:
                    continue
                if namespaces and page.namespace not in namespaces:
                    continue
                count += 1
            return count
    except Exception as e:
        logger.error(f"Error counting pages in MediaWiki dump: {str(e)}")
        return 0

#
# End of Media_Wiki.py
#######################################################################################################################