Spaces:

oceansweep
/

tldw

Sleeping

App Files Files Community

oceansweep committed on Sep 11

Commit

2544492

•

1 Parent(s): 3372a32

Upload 2 files

Browse files

Files changed (2) hide show

Config_Files/config.txt +176 -0
Config_Files/mediawiki_import_config.yaml +63 -0

Config_Files/config.txt ADDED Viewed

	@@ -0,0 +1,176 @@

+# Application configuration: API credentials and models, local inference
+# endpoints, paths, processing, prompts, database, embeddings and chunking.
+#
+# NOTE(review): this file previously contained two concatenated copies of every
+# section ([API] through [Chunking]). Duplicate sections are handled
+# inconsistently by INI parsers (merged, last-wins, or rejected outright - e.g.
+# Python's configparser in its default strict mode raises an error), so the two
+# copies have been merged and every section and key now appears exactly once.
+# Where the copies disagreed on a real value, the value from the first copy is
+# kept and the other one is recorded in a comment above the key - confirm which
+# one is intended. Differences that were only placeholder wording are not noted.
+[API]
+anthropic_api_key = <anthropic_API_Key>
+# NOTE(review): the duplicate [API] section used claude-3-sonnet-20240229 here.
+anthropic_model = claude-3-5-sonnet-20240620
+cohere_api_key = <Cohere_API_Key>
+cohere_model = command-r-plus
+groq_api_key = <Groq_API_Key>
+groq_model = llama3-70b-8192
+openai_api_key = <OpenAI_API_Key>
+# NOTE(review): the duplicate [API] section used gpt-4-turbo here.
+openai_model = gpt-4o
+# NOTE(review): one copy named this key huggingface_api_key and the other
+# huggingface_api_token. Both are kept so that whichever name the application
+# reads is still present - confirm the real name and drop the other.
+huggingface_api_key = <huggingface_api_token>
+huggingface_api_token = <huggingface_api_token>
+huggingface_model = CohereForAI/c4ai-command-r-plus
+openrouter_api_key = <OpenRouter_API_Key>
+openrouter_model = mistralai/mistral-7b-instruct:free
+deepseek_api_key = <DeepSeek_API_Key>
+deepseek_model = deepseek-chat
+mistral_api_key = <mistral_api_key>
+mistral_model = mistral-large-latest
+custom_openai_api = <key_here>
+custom_openai_api_ip = <api_ip_here>
+[Local-API]
+# NOTE(review): the duplicate [Local-API] section used the placeholder
+# <kobold api key> here instead of an empty value.
+kobold_api_key =
+kobold_api_IP = http://127.0.0.1:5001/api/v1/generate
+llama_api_key = <llama.cpp api key>
+llama_api_IP = http://127.0.0.1:8080/completion
+ooba_api_key = <ooba api key>
+ooba_api_IP = http://127.0.0.1:5000/v1/chat/completions
+tabby_api_key = <tabbyapi key>
+tabby_api_IP = http://127.0.0.1:5000/v1/chat/completions
+vllm_api_IP = http://127.0.0.1:8000/v1/chat/completions
+vllm_model = <vllm model>
+ollama_api_key = <ollama api key>
+ollama_api_IP = http://127.0.0.1:11434/api/generate
+ollama_model = <ollama model>
+aphrodite_api_key = <aphrodite_api_key>
+aphrodite_api_IP = http://127.0.0.1:8080/completion
+[Paths]
+output_path = Results
+logging_file = Logs
+[Processing]
+# Use 'cuda' for GPU processing; swap it for 'cpu' to process on the CPU.
+processing_choice = cuda
+[Settings]
+chunk_duration = 30
+words_per_second = 3
+[Prompts]
+# NOTE(review): these values are wrapped in double quotes. Some INI parsers
+# (Python's configparser among them) keep the quote characters as part of the
+# value - confirm that the reader strips them.
+prompt_sample = "What is the meaning of life?"
+video_summarize_prompt = "Above is the transcript of a video. Please read through the transcript carefully. Identify the main topics that are discussed over the course of the transcript. Then, summarize the key points about each main topic in bullet points. The bullet points should cover the key information conveyed about each topic in the video, but should be much shorter than the full transcript. Please output your bullet point summary inside <bulletpoints> tags. Do not repeat yourself while writing the summary."
+[Database]
+# Database backend: `sqlite`, or `elasticsearch` (see the elasticsearch_* keys).
+type = sqlite
+# NOTE(review): this is an absolute path from the filesystem root, whereas
+# mediawiki_import_config.yaml uses './Databases/media_summary.db' - confirm
+# which location is intended.
+sqlite_path = /Databases/media_summary.db
+# Path to the backup location for the database. If the path does not exist, the backup will not be created.
+backup_path = /tldw_DB_Backups/
+# To use Elasticsearch as the database, replace `sqlite` with `elasticsearch`
+# for `type` and set `elasticsearch_host` and `elasticsearch_port` to those of
+# your configured ES instance.
+elasticsearch_host = localhost
+elasticsearch_port = 9200
+chroma_db_path = chroma_db
+[Embeddings]
+# Can be 'openai', 'local', or 'huggingface'
+provider = openai
+# Model name or path
+model = text-embedding-3-small
+api_key = your_api_key_here
+# Only needed for 'local' provider
+api_url = http://localhost:8080/v1/embeddings
+[Chunking]
+method = words
+max_size = 400
+overlap = 200
+adaptive = false
+multi_level = false
+language = english
+#[Comments]
+#OpenAI Models:
+#    f
+#Anthropic Models:
+#    f
+#Cohere Models:
+#    f
+#DeepSeek Models:
+#    f
+#Groq Models:
+#    f
+#Mistral Models:
+#    mistral-large-latest
+#    open-mistral-nemo
+#    codestral-latest
+#    mistral-embed
+#    open-mistral-7b
+#    open-mixtral-8x7b
+#    open-mixtral-8x22b
+#    open-codestral-mamba

Config_Files/mediawiki_import_config.yaml ADDED Viewed

	@@ -0,0 +1,63 @@

+# Configuration for the MediaWiki import.
+# --- Database ---
+database:
+  sqlite_path: "./Databases/media_summary.db"
+  chroma_db_path: "chroma_db"
+# --- Chunking ---
+chunking:
+  default_method: "sentences"
+  default_size: 1000
+  default_overlap: 100
+  adaptive: true
+  language: "en"
+  methods: ["sentences", "words", "paragraphs", "tokens"]
+# --- Import ---
+import:
+  # Number of pages to process in a single batch.
+  batch_size: 1000
+  default_skip_redirects: true
+  # Namespace 0 (the main namespace) is the default.
+  default_namespaces:
+    - 0
+  single_item_default: false
+# --- Processing ---
+processing:
+  # Number of worker threads for async processing.
+  max_workers: 4
+# --- Embeddings ---
+embeddings:
+  # One of "openai", "local" or "huggingface".
+  provider: "openai"
+  model: "text-embedding-ada-002"
+  # Remove this key when using local embeddings.
+  api_key: "your_openai_api_key_here"
+  # Only used for local embeddings.
+  local_url: "http://localhost:8080/embeddings"
+# --- ChromaDB ---
+chromadb:
+  collection_prefix: "mediawiki_"
+# --- Logging ---
+logging:
+  level: "INFO"
+  file: "mediawiki_import.log"
+# --- Checkpoints ---
+checkpoints:
+  enabled: true
+  directory: "import_checkpoints"
+# --- Error handling ---
+error_handling:
+  max_retries: 3
+  # Delay between retries, in seconds.
+  retry_delay: 5
+# --- User interface ---
+ui:
+  default_chunk_size: 1000
+  min_chunk_size: 100
+  max_chunk_size: 2000
+  default_chunk_overlap: 100