# Spaces: Running on T4
# MediaWiki Import Configuration
# Database settings
database:
  # Both values are relative paths as written; presumably resolved against
  # the importer's working directory -- TODO confirm with the loader.
  sqlite_path: './Databases/media_summary.db'
  chroma_db_path: 'chroma_db'
# Chunking options
chunking:
  default_method: 'sentences'  # matches an entry in `methods` below
  # NOTE(review): units of size/overlap presumably depend on the chosen
  # method (sentences, words, ...) -- confirm against the chunker.
  default_size: 1000
  default_overlap: 100
  adaptive: true
  language: 'en'
  # Chunking methods listed for this importer.
  methods:
    - 'sentences'
    - 'words'
    - 'paragraphs'
    - 'tokens'
# Import settings
import:
  batch_size: 1000  # Number of pages to process in a single batch
  default_skip_redirects: true
  default_namespaces: [0]  # Main namespace by default
  single_item_default: false
# Processing options
processing:
  max_workers: 4  # Number of worker threads for async processing
# Embedding settings
embeddings:
  provider: 'openai'  # or 'local' or 'huggingface'
  model: 'text-embedding-ada-002'
  # Placeholder value only. Avoid committing a real key to version control;
  # NOTE(review): prefer supplying it via the environment or a secret store
  # if the loader supports that -- confirm.
  api_key: 'your_openai_api_key_here'  # Remove if using local embeddings
  local_url: 'http://localhost:8080/embeddings'  # Only for local embeddings
# ChromaDB settings
chromadb:
  # NOTE(review): presumably prepended to each collection name created by
  # the importer -- confirm against the code that reads this key.
  collection_prefix: 'mediawiki_'
# Logging settings
logging:
  level: 'INFO'
  file: 'mediawiki_import.log'  # relative path as written
# Checkpoint settings
checkpoints:
  enabled: true
  directory: 'import_checkpoints'  # relative path as written
# Error handling
error_handling:
  max_retries: 3
  retry_delay: 5  # seconds
# User interface settings
ui:
  # The default size (1000) lies within the min/max bounds given below.
  default_chunk_size: 1000
  min_chunk_size: 100
  max_chunk_size: 2000
  default_chunk_overlap: 100