|
MARKDOWN_SOURCE_DIR = "data/transformers/docs/source/en/" |
|
LANCEDB_DIRECTORY = "lancedb" |
|
LANCEDB_TABLE_NAME = "table" |
|
VECTOR_COLUMN_NAME = "embedding" |
|
TEXT_COLUMN_NAME = "text" |
|
DOCUMENT_PATH_COLUMN_NAME = "document_path" |
|
|
|
CHUNK_POLICY = "md" |
|
|
|
|
|
EMBED_NAME = "sentence-transformers/all-MiniLM-L6-v2" |
|
|
|
|
|
TOP_K_RANK = 50 |
|
TOP_K_RERANK = 5 |
|
|
|
emb_sizes = { |
|
"sentence-transformers/all-MiniLM-L6-v2": 384, |
|
"thenlper/gte-large": 1024, |
|
"text-embedding-ada-002": 1536, |
|
} |
|
|
|
thresh_distances = { |
|
"sentence-transformers/all-MiniLM-L6-v2": 1.2, |
|
"text-embedding-ada-002": 0.5, |
|
} |
|
|
|
context_lengths = { |
|
"mistralai/Mistral-7B-Instruct-v0.1": 4096, |
|
"tiiuae/falcon-180B-chat": 2048, |
|
"GeneZC/MiniChat-3B": 4096, |
|
"gpt-3.5-turbo": 4096, |
|
"sentence-transformers/all-MiniLM-L6-v2": 128, |
|
"thenlper/gte-large": 512, |
|
"text-embedding-ada-002": 1000, |
|
} |
|
|