Spaces:
Sleeping
Sleeping
File size: 1,434 Bytes
2544492 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# MediaWiki Import Configuration
# Database settings
# Both values are relative paths.
# NOTE(review): presumably resolved against the process working directory —
# confirm with the code that loads this file.
database:
  sqlite_path: './Databases/media_summary.db'
  chroma_db_path: 'chroma_db'
# Chunking options
# default_method is one of the entries listed under `methods`.
# NOTE(review): the unit of default_size / default_overlap is not stated here;
# presumably it follows the selected method — confirm with the consumer.
chunking:
  default_method: 'sentences'
  default_size: 1000
  default_overlap: 100
  adaptive: true
  language: 'en'
  methods:
    - 'sentences'
    - 'words'
    - 'paragraphs'
    - 'tokens'
# Import settings
import:
  batch_size: 1000  # Number of pages to process in a single batch
  default_skip_redirects: true
  default_namespaces: [0]  # Main namespace by default
  single_item_default: false
# Processing options
processing:
  max_workers: 4  # Number of worker threads for async processing
# Embedding settings
# The api_key value below is a placeholder, not a working credential.
# Do not commit a real key to version control.
# NOTE(review): if the loader supports it, prefer supplying the key through
# an environment variable or secret store — confirm before relying on this.
embeddings:
  provider: 'openai'  # or 'local' or 'huggingface'
  model: 'text-embedding-ada-002'
  api_key: 'your_openai_api_key_here'  # Remove if using local embeddings
  local_url: 'http://localhost:8080/embeddings'  # Only for local embeddings
# ChromaDB settings
# NOTE(review): the prefix is presumably prepended to collection names
# created by the importer — confirm with the consumer.
chromadb:
  collection_prefix: 'mediawiki_'
# Logging settings
# `file` is a relative path to the log file.
logging:
  level: 'INFO'
  file: 'mediawiki_import.log'
# Checkpoint settings
# `directory` is a relative path.
checkpoints:
  enabled: true
  directory: 'import_checkpoints'
# Error handling
error_handling:
  max_retries: 3
  retry_delay: 5  # seconds
# User interface settings
# default_chunk_size lies within [min_chunk_size, max_chunk_size].
# The defaults here (1000 / 100) match chunking.default_size and
# chunking.default_overlap; keep them in sync if either is changed.
ui:
  default_chunk_size: 1000
  min_chunk_size: 100
  max_chunk_size: 2000
  default_chunk_overlap: 100