Spaces:
Sleeping
Sleeping
oceansweep
committed on
Commit
•
2544492
1
Parent(s):
3372a32
Upload 2 files
Browse files- Config_Files/config.txt +176 -0
- Config_Files/mediawiki_import_config.yaml +63 -0
Config_Files/config.txt
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[API]
|
2 |
+
anthropic_api_key = <anthropic_API_Key>
|
3 |
+
anthropic_model = claude-3-5-sonnet-20240620
|
4 |
+
cohere_api_key = <Cohere_API_Key>
|
5 |
+
cohere_model = command-r-plus
|
6 |
+
groq_api_key = <Groq_API_Key>
|
7 |
+
groq_model = llama3-70b-8192
|
8 |
+
openai_api_key = <OpenAI_API_Key>
|
9 |
+
openai_model = gpt-4o
|
10 |
+
huggingface_api_key = <huggingface_api_token>
|
11 |
+
huggingface_model = CohereForAI/c4ai-command-r-plus
|
12 |
+
openrouter_api_key = <OpenRouter_API_Key>
|
13 |
+
openrouter_model = mistralai/mistral-7b-instruct:free
|
14 |
+
deepseek_api_key = <DeepSeek_API_Key>
|
15 |
+
deepseek_model = deepseek-chat
|
16 |
+
mistral_model = mistral-large-latest
|
17 |
+
mistral_api_key = <mistral_api_key>
|
18 |
+
custom_openai_api = <key_here>
|
19 |
+
custom_openai_api_ip = <api_ip_here>
|
20 |
+
|
21 |
+
[Local-API]
|
22 |
+
kobold_api_key =
|
23 |
+
kobold_api_IP = http://127.0.0.1:5001/api/v1/generate
|
24 |
+
llama_api_key = <llama.cpp api key>
|
25 |
+
llama_api_IP = http://127.0.0.1:8080/completion
|
26 |
+
ooba_api_key = <ooba api key>
|
27 |
+
ooba_api_IP = http://127.0.0.1:5000/v1/chat/completions
|
28 |
+
tabby_api_IP = http://127.0.0.1:5000/v1/chat/completions
|
29 |
+
tabby_api_key = <tabbyapi key>
|
30 |
+
vllm_api_IP = http://127.0.0.1:8000/v1/chat/completions
|
31 |
+
vllm_model = <vllm model>
|
32 |
+
ollama_api_IP = http://127.0.0.1:11434/api/generate
|
33 |
+
ollama_api_key = <ollama api key>
|
34 |
+
ollama_model = <ollama model>
|
35 |
+
aphrodite_api_IP = http://127.0.0.1:8080/completion
|
36 |
+
aphrodite_api_key = <aphrodite_api_key>
|
37 |
+
|
38 |
+
[Paths]
|
39 |
+
output_path = Results
|
40 |
+
logging_file = Logs
|
41 |
+
|
42 |
+
[Processing]
|
43 |
+
processing_choice = cuda
|
44 |
+
# Replace 'cuda' with 'cpu' if you want to use your CPU for processing.
|
45 |
+
|
46 |
+
[Settings]
|
47 |
+
chunk_duration = 30
|
48 |
+
words_per_second = 3
|
49 |
+
|
50 |
+
[Prompts]
|
51 |
+
prompt_sample = "What is the meaning of life?"
|
52 |
+
video_summarize_prompt = "Above is the transcript of a video. Please read through the transcript carefully. Identify the main topics that are discussed over the course of the transcript. Then, summarize the key points about each main topic in bullet points. The bullet points should cover the key information conveyed about each topic in the video, but should be much shorter than the full transcript. Please output your bullet point summary inside <bulletpoints> tags. Do not repeat yourself while writing the summary."
|
53 |
+
|
54 |
+
[Database]
|
55 |
+
type = sqlite
|
56 |
+
sqlite_path = /Databases/media_summary.db
|
57 |
+
elasticsearch_host = localhost
|
58 |
+
elasticsearch_port = 9200
|
59 |
+
# Alternatively, you can use Elasticsearch as the database type: replace `sqlite` with `elasticsearch` for `type`, and set `elasticsearch_host` and `elasticsearch_port` to those of your configured ES instance.
|
60 |
+
chroma_db_path = chroma_db
|
61 |
+
|
62 |
+
[Embeddings]
|
63 |
+
provider = openai
|
64 |
+
# Can be 'openai', 'local', or 'huggingface'
|
65 |
+
model = text-embedding-3-small
|
66 |
+
# Model name or path
|
67 |
+
api_key = your_api_key_here
|
68 |
+
api_url = http://localhost:8080/v1/embeddings
|
69 |
+
# Only needed for 'local' provider
|
70 |
+
|
71 |
+
[Chunking]
|
72 |
+
method = words
|
73 |
+
max_size = 400
|
74 |
+
overlap = 200
|
75 |
+
adaptive = false
|
76 |
+
multi_level = false
|
77 |
+
language = english
|
78 |
+
|
79 |
+
#[Comments]
|
80 |
+
#OpenAI Models:
|
81 |
+
# f
|
82 |
+
#Anthropic Models:
|
83 |
+
# f
|
84 |
+
#Cohere Models:
|
85 |
+
# f
|
86 |
+
#DeepSeek Models:
|
87 |
+
# f
|
88 |
+
#Groq Models:
|
89 |
+
# f
|
90 |
+
#Mistral Models:
|
91 |
+
# mistral-large-latest
|
92 |
+
# open-mistral-nemo
|
93 |
+
# codestral-latest
|
94 |
+
# mistral-embed
|
95 |
+
# open-mistral-7b
|
96 |
+
# open-mixtral-8x7b
|
97 |
+
# open-mixtral-8x22b
|
98 |
+
# open-codestral-mamba
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
|
103 |
+
|
104 |
+
|
105 |
+
[API]
|
106 |
+
anthropic_api_key = <anthropic_api_key>
|
107 |
+
anthropic_model = claude-3-sonnet-20240229
|
108 |
+
cohere_api_key = <your_cohere_api_key>
|
109 |
+
cohere_model = command-r-plus
|
110 |
+
groq_api_key = <your_groq_api_key>
|
111 |
+
groq_model = llama3-70b-8192
|
112 |
+
openai_api_key = <openai_api_key>
|
113 |
+
openai_model = gpt-4-turbo
|
114 |
+
huggingface_api_token = <huggingface_api_token>
|
115 |
+
huggingface_model = CohereForAI/c4ai-command-r-plus
|
116 |
+
openrouter_api_key = <openrouter_api_key>
|
117 |
+
openrouter_model = mistralai/mistral-7b-instruct:free
|
118 |
+
deepseek_api_key = <deepseek_api_key>
|
119 |
+
deepseek_model = deepseek-chat
|
120 |
+
|
121 |
+
[Local-API]
|
122 |
+
kobold_api_key = <kobold api key>
|
123 |
+
kobold_api_IP = http://127.0.0.1:5001/api/v1/generate
|
124 |
+
llama_api_key = <llama.cpp api key>
|
125 |
+
llama_api_IP = http://127.0.0.1:8080/completion
|
126 |
+
ooba_api_key = <ooba api key>
|
127 |
+
ooba_api_IP = http://127.0.0.1:5000/v1/chat/completions
|
128 |
+
tabby_api_IP = http://127.0.0.1:5000/v1/chat/completions
|
129 |
+
tabby_api_key = <tabbyapi key>
|
130 |
+
vllm_api_IP = http://127.0.0.1:8000/v1/chat/completions
|
131 |
+
vllm_model = <vllm model>
|
132 |
+
ollama_api_IP = http://127.0.0.1:11434/api/generate
|
133 |
+
ollama_api_key = <ollama api key>
|
134 |
+
ollama_model = <ollama model>
|
135 |
+
|
136 |
+
[Paths]
|
137 |
+
output_path = Results
|
138 |
+
logging_file = Logs
|
139 |
+
|
140 |
+
[Processing]
|
141 |
+
processing_choice = cuda
|
142 |
+
|
143 |
+
[Settings]
|
144 |
+
chunk_duration = 30
|
145 |
+
words_per_second = 3
|
146 |
+
|
147 |
+
[Prompts]
|
148 |
+
prompt_sample = "What is the meaning of life?"
|
149 |
+
video_summarize_prompt = "Above is the transcript of a video. Please read through the transcript carefully. Identify the main topics that are discussed over the course of the transcript. Then, summarize the key points about each main topic in bullet points. The bullet points should cover the key information conveyed about each topic in the video, but should be much shorter than the full transcript. Please output your bullet point summary inside <bulletpoints> tags. Do not repeat yourself while writing the summary."
|
150 |
+
|
151 |
+
[Database]
|
152 |
+
type = sqlite
|
153 |
+
sqlite_path = /Databases/media_summary.db
|
154 |
+
backup_path = /tldw_DB_Backups/
|
155 |
+
# Path to the backup location for the database. If this path does not exist, no backup will be created.
|
156 |
+
elasticsearch_host = localhost
|
157 |
+
elasticsearch_port = 9200
|
158 |
+
# Alternatively, you can use Elasticsearch as the database type: replace `sqlite` with `elasticsearch` for `type`, and set `elasticsearch_host` and `elasticsearch_port` to those of your configured ES instance.
|
159 |
+
chroma_db_path = chroma_db
|
160 |
+
|
161 |
+
[Embeddings]
|
162 |
+
provider = openai
|
163 |
+
# Can be 'openai', 'local', or 'huggingface'
|
164 |
+
model = text-embedding-3-small
|
165 |
+
# Model name or path
|
166 |
+
api_key = your_api_key_here
|
167 |
+
api_url = http://localhost:8080/v1/embeddings
|
168 |
+
# Only needed for 'local' provider
|
169 |
+
|
170 |
+
[Chunking]
|
171 |
+
method = words
|
172 |
+
max_size = 400
|
173 |
+
overlap = 200
|
174 |
+
adaptive = false
|
175 |
+
multi_level = false
|
176 |
+
language = english
|
Config_Files/mediawiki_import_config.yaml
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MediaWiki Import Configuration
|
2 |
+
|
3 |
+
# Database settings
|
4 |
+
database:
|
5 |
+
sqlite_path: './Databases/media_summary.db'
|
6 |
+
chroma_db_path: 'chroma_db'
|
7 |
+
|
8 |
+
# Chunking options
|
9 |
+
chunking:
|
10 |
+
default_method: 'sentences'
|
11 |
+
default_size: 1000
|
12 |
+
default_overlap: 100
|
13 |
+
adaptive: true
|
14 |
+
language: 'en'
|
15 |
+
methods:
|
16 |
+
- 'sentences'
|
17 |
+
- 'words'
|
18 |
+
- 'paragraphs'
|
19 |
+
- 'tokens'
|
20 |
+
|
21 |
+
# Import settings
|
22 |
+
import:
|
23 |
+
batch_size: 1000 # Number of pages to process in a single batch
|
24 |
+
default_skip_redirects: true
|
25 |
+
default_namespaces: [0] # Main namespace by default
|
26 |
+
single_item_default: false
|
27 |
+
|
28 |
+
# Processing options
|
29 |
+
processing:
|
30 |
+
max_workers: 4 # Number of worker threads for async processing
|
31 |
+
|
32 |
+
# Embedding settings
|
33 |
+
embeddings:
|
34 |
+
provider: 'openai' # or 'local' or 'huggingface'
|
35 |
+
model: 'text-embedding-ada-002'
|
36 |
+
api_key: 'your_openai_api_key_here' # Remove if using local embeddings
|
37 |
+
local_url: 'http://localhost:8080/embeddings' # Only for local embeddings
|
38 |
+
|
39 |
+
# ChromaDB settings
|
40 |
+
chromadb:
|
41 |
+
collection_prefix: 'mediawiki_'
|
42 |
+
|
43 |
+
# Logging settings
|
44 |
+
logging:
|
45 |
+
level: 'INFO'
|
46 |
+
file: 'mediawiki_import.log'
|
47 |
+
|
48 |
+
# Checkpoint settings
|
49 |
+
checkpoints:
|
50 |
+
enabled: true
|
51 |
+
directory: 'import_checkpoints'
|
52 |
+
|
53 |
+
# Error handling
|
54 |
+
error_handling:
|
55 |
+
max_retries: 3
|
56 |
+
retry_delay: 5 # seconds
|
57 |
+
|
58 |
+
# User interface settings
|
59 |
+
ui:
|
60 |
+
default_chunk_size: 1000
|
61 |
+
min_chunk_size: 100
|
62 |
+
max_chunk_size: 2000
|
63 |
+
default_chunk_overlap: 100
|