oceansweep committed on
Commit
2544492
1 Parent(s): 3372a32

Upload 2 files

Browse files
Config_Files/config.txt ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [API]
2
+ anthropic_api_key = <anthropic_API_Key>
3
+ anthropic_model = claude-3-5-sonnet-20240620
4
+ cohere_api_key = <Cohere_API_Key>
5
+ cohere_model = command-r-plus
6
+ groq_api_key = <Groq_API_Key>
7
+ groq_model = llama3-70b-8192
8
+ openai_api_key = <OpenAI_API_Key>
9
+ openai_model = gpt-4o
10
+ huggingface_api_key = <huggingface_api_token>
11
+ huggingface_model = CohereForAI/c4ai-command-r-plus
12
+ openrouter_api_key = <OpenRouter_API_Key>
13
+ openrouter_model = mistralai/mistral-7b-instruct:free
14
+ deepseek_api_key = <DeepSeek_API_Key>
15
+ deepseek_model = deepseek-chat
16
+ mistral_model = mistral-large-latest
17
+ mistral_api_key = <mistral_api_key>
18
+ custom_openai_api = <key_here>
19
+ custom_openai_api_ip = <api_ip_here>
20
+
21
+ [Local-API]
22
+ kobold_api_key =
23
+ kobold_api_IP = http://127.0.0.1:5001/api/v1/generate
24
+ llama_api_key = <llama.cpp api key>
25
+ llama_api_IP = http://127.0.0.1:8080/completion
26
+ ooba_api_key = <ooba api key>
27
+ ooba_api_IP = http://127.0.0.1:5000/v1/chat/completions
28
+ tabby_api_IP = http://127.0.0.1:5000/v1/chat/completions
29
+ tabby_api_key = <tabbyapi key>
30
+ vllm_api_IP = http://127.0.0.1:8000/v1/chat/completions
31
+ vllm_model = <vllm model>
32
+ ollama_api_IP = http://127.0.0.1:11434/api/generate
33
+ ollama_api_key = <ollama api key>
34
+ ollama_model = <ollama model>
35
+ aphrodite_api_IP = http://127.0.0.1:8080/completion
36
+ aphrodite_api_key = <aphrodite_api_key>
37
+
38
+ [Paths]
39
+ output_path = Results
40
+ logging_file = Logs
41
+
42
+ [Processing]
43
+ processing_choice = cuda
44
+ # Can swap 'cuda' with 'cpu' if you want to use your CPU for processing
45
+
46
+ [Settings]
47
+ chunk_duration = 30
48
+ words_per_second = 3
49
+
50
+ [Prompts]
51
+ prompt_sample = "What is the meaning of life?"
52
+ video_summarize_prompt = "Above is the transcript of a video. Please read through the transcript carefully. Identify the main topics that are discussed over the course of the transcript. Then, summarize the key points about each main topic in bullet points. The bullet points should cover the key information conveyed about each topic in the video, but should be much shorter than the full transcript. Please output your bullet point summary inside <bulletpoints> tags. Do not repeat yourself while writing the summary."
53
+
54
+ [Database]
55
+ type = sqlite
56
+ sqlite_path = /Databases/media_summary.db
57
+ elasticsearch_host = localhost
58
+ elasticsearch_port = 9200
59
+ # Additionally you can use elasticsearch as the database type, just replace `sqlite` with `elasticsearch` for `type` and provide the `elasticsearch_host` and `elasticsearch_port` of your configured ES instance.
60
+ chroma_db_path = chroma_db
61
+
62
+ [Embeddings]
63
+ provider = openai
64
+ # Can be 'openai', 'local', or 'huggingface'
65
+ model = text-embedding-3-small
66
+ # Model name or path
67
+ api_key = your_api_key_here
68
+ api_url = http://localhost:8080/v1/embeddings
69
+ # Only needed for 'local' provider
70
+
71
+ [Chunking]
72
+ method = words
73
+ max_size = 400
74
+ overlap = 200
75
+ adaptive = false
76
+ multi_level = false
77
+ language = english
78
+
79
+ #[Comments]
80
+ #OpenAI Models:
81
+ # f
82
+ #Anthropic Models:
83
+ # f
84
+ #Cohere Models:
85
+ # f
86
+ #DeepSeek Models:
87
+ # f
88
+ #Groq Models:
89
+ # f
90
+ #Mistral Models:
91
+ # mistral-large-latest
92
+ # open-mistral-nemo
93
+ # codestral-latest
94
+ # mistral-embed
95
+ # open-mistral-7b
96
+ # open-mixtral-8x7b
97
+ # open-mixtral-8x22b
98
+ # open-codestral-mamba
99
+
100
+
101
+
102
+
103
+
104
+
105
+ # NOTE: the sections from here down duplicate [API], [Local-API], [Paths],
+ # [Processing], [Settings], [Prompts], [Database], [Embeddings] and [Chunking]
+ # already defined above, with some divergent keys (e.g. huggingface_api_key
+ # above vs huggingface_api_token below; the second [Database] adds backup_path).
+ # Duplicate section handling is parser-dependent: Python configparser in strict
+ # mode raises DuplicateSectionError, while other parsers merge with
+ # last-value-wins. Consolidate to a single copy of each section.
+ [API]
106
+ anthropic_api_key = <anthropic_api_key>
107
+ anthropic_model = claude-3-sonnet-20240229
108
+ cohere_api_key = <your_cohere_api_key>
109
+ cohere_model = command-r-plus
110
+ groq_api_key = <your_groq_api_key>
111
+ groq_model = llama3-70b-8192
112
+ openai_api_key = <openai_api_key>
113
+ openai_model = gpt-4-turbo
114
+ huggingface_api_token = <huggingface_api_token>
115
+ huggingface_model = CohereForAI/c4ai-command-r-plus
116
+ openrouter_api_key = <openrouter_api_key>
117
+ openrouter_model = mistralai/mistral-7b-instruct:free
118
+ deepseek_api_key = <deepseek_api_key>
119
+ deepseek_model = deepseek-chat
120
+
121
+ [Local-API]
122
+ kobold_api_key = <kobold api key>
123
+ kobold_api_IP = http://127.0.0.1:5001/api/v1/generate
124
+ llama_api_key = <llama.cpp api key>
125
+ llama_api_IP = http://127.0.0.1:8080/completion
126
+ ooba_api_key = <ooba api key>
127
+ ooba_api_IP = http://127.0.0.1:5000/v1/chat/completions
128
+ tabby_api_IP = http://127.0.0.1:5000/v1/chat/completions
129
+ tabby_api_key = <tabbyapi key>
130
+ vllm_api_IP = http://127.0.0.1:8000/v1/chat/completions
131
+ vllm_model = <vllm model>
132
+ ollama_api_IP = http://127.0.0.1:11434/api/generate
133
+ ollama_api_key = <ollama api key>
134
+ ollama_model = <ollama model>
135
+
136
+ [Paths]
137
+ output_path = Results
138
+ logging_file = Logs
139
+
140
+ [Processing]
141
+ processing_choice = cuda
142
+
143
+ [Settings]
144
+ chunk_duration = 30
145
+ words_per_second = 3
146
+
147
+ [Prompts]
148
+ prompt_sample = "What is the meaning of life?"
149
+ video_summarize_prompt = "Above is the transcript of a video. Please read through the transcript carefully. Identify the main topics that are discussed over the course of the transcript. Then, summarize the key points about each main topic in bullet points. The bullet points should cover the key information conveyed about each topic in the video, but should be much shorter than the full transcript. Please output your bullet point summary inside <bulletpoints> tags. Do not repeat yourself while writing the summary."
150
+
151
+ [Database]
152
+ type = sqlite
153
+ sqlite_path = /Databases/media_summary.db
154
+ backup_path = /tldw_DB_Backups/
155
+ #Path to the backup location for the database. If the path does not exist, the backup will not be created.
156
+ elasticsearch_host = localhost
157
+ elasticsearch_port = 9200
158
+ # Additionally you can use elasticsearch as the database type, just replace `sqlite` with `elasticsearch` for `type` and provide the `elasticsearch_host` and `elasticsearch_port` of your configured ES instance.
159
+ chroma_db_path = chroma_db
160
+
161
+ [Embeddings]
162
+ provider = openai
163
+ # Can be 'openai', 'local', or 'huggingface'
164
+ model = text-embedding-3-small
165
+ # Model name or path
166
+ api_key = your_api_key_here
167
+ api_url = http://localhost:8080/v1/embeddings
168
+ # Only needed for 'local' provider
169
+
170
+ [Chunking]
171
+ method = words
172
+ max_size = 400
173
+ overlap = 200
174
+ adaptive = false
175
+ multi_level = false
176
+ language = english
Config_Files/mediawiki_import_config.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MediaWiki Import Configuration
2
+
3
+ # Database settings
4
+ database:
5
+ sqlite_path: './Databases/media_summary.db'
6
+ chroma_db_path: 'chroma_db'
7
+
8
+ # Chunking options
9
+ chunking:
10
+ default_method: 'sentences'
11
+ default_size: 1000
12
+ default_overlap: 100
13
+ adaptive: true
14
+ language: 'en'
15
+ methods:
16
+ - 'sentences'
17
+ - 'words'
18
+ - 'paragraphs'
19
+ - 'tokens'
20
+
21
+ # Import settings
22
+ import:
23
+ batch_size: 1000 # Number of pages to process in a single batch
24
+ default_skip_redirects: true
25
+ default_namespaces: [0] # Main namespace by default
26
+ single_item_default: false
27
+
28
+ # Processing options
29
+ processing:
30
+ max_workers: 4 # Number of worker threads for async processing
31
+
32
+ # Embedding settings
33
+ embeddings:
34
+ provider: 'openai' # or 'local' or 'huggingface'
35
+ model: 'text-embedding-ada-002'
36
+ api_key: 'your_openai_api_key_here' # Remove if using local embeddings
37
+ local_url: 'http://localhost:8080/embeddings' # Only for local embeddings
38
+
39
+ # ChromaDB settings
40
+ chromadb:
41
+ collection_prefix: 'mediawiki_'
42
+
43
+ # Logging settings
44
+ logging:
45
+ level: 'INFO'
46
+ file: 'mediawiki_import.log'
47
+
48
+ # Checkpoint settings
49
+ checkpoints:
50
+ enabled: true
51
+ directory: 'import_checkpoints'
52
+
53
+ # Error handling
54
+ error_handling:
55
+ max_retries: 3
56
+ retry_delay: 5 # seconds
57
+
58
+ # User interface settings
59
+ ui:
60
+ default_chunk_size: 1000
61
+ min_chunk_size: 100
62
+ max_chunk_size: 2000
63
+ default_chunk_overlap: 100