Spaces:
Sleeping
Sleeping
ray
committed on
Commit
·
9021b39
1
Parent(s):
61ec090
v2 - manually split knowledge units
Browse files- .gitignore +4 -0
- app.py +13 -6
- chatbot.py +8 -8
- custom_io.py +45 -0
- scripts/convert_docx_to_md.sh +37 -0
.gitignore
CHANGED
@@ -1,3 +1,7 @@
|
|
1 |
.env
|
2 |
**/__pycache__
|
3 |
awesumcare_data
|
|
|
|
|
|
|
|
|
|
1 |
.env
|
2 |
**/__pycache__
|
3 |
awesumcare_data
|
4 |
+
TestData
|
5 |
+
logs
|
6 |
+
wandb
|
7 |
+
streamlit_chatbot_pack
|
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
import openai
|
3 |
import os
|
@@ -14,7 +15,7 @@ from llama_index.ingestion import IngestionPipeline
|
|
14 |
from chat_template import CHAT_TEXT_QA_PROMPT
|
15 |
from schemas import ChatbotVersion, ServiceProvider
|
16 |
from chatbot import Chatbot, IndexBuilder
|
17 |
-
from custom_io import UnstructuredReader, default_file_metadata_func
|
18 |
from qdrant import client as qdrantClient
|
19 |
from llama_index import set_global_service_context
|
20 |
|
@@ -28,11 +29,11 @@ llama_index.set_global_handler("arize_phoenix")
|
|
28 |
openai.api_key = os.getenv("OPENAI_API_KEY")
|
29 |
|
30 |
IS_LOAD_FROM_VECTOR_STORE = True
|
31 |
-
VDB_COLLECTION_NAME = "demo-
|
32 |
MODEL_NAME = ChatbotVersion.CHATGPT_4.value
|
33 |
|
34 |
|
35 |
-
CHUNK_SIZE =
|
36 |
LLM, EMBED_MODEL = get_service_provider_config(
|
37 |
service_provider=ServiceProvider.OPENAI, model_name=MODEL_NAME)
|
38 |
service_context = ServiceContext.from_defaults(
|
@@ -45,13 +46,19 @@ set_global_service_context(service_context)
|
|
45 |
|
46 |
class AwesumIndexBuilder(IndexBuilder):
|
47 |
def _load_doucments(self):
|
48 |
-
|
|
|
|
|
|
|
|
|
49 |
".pdf": UnstructuredReader(),
|
50 |
".docx": UnstructuredReader(),
|
51 |
".pptx": UnstructuredReader(),
|
|
|
52 |
},
|
53 |
recursive=True,
|
54 |
-
|
|
|
55 |
file_metadata=default_file_metadata_func)
|
56 |
|
57 |
self.documents = dir_reader.load_data()
|
@@ -73,7 +80,7 @@ class AwesumIndexBuilder(IndexBuilder):
|
|
73 |
return
|
74 |
pipeline = IngestionPipeline(
|
75 |
transformations=[
|
76 |
-
SentenceSplitter(),
|
77 |
self.embed_model,
|
78 |
],
|
79 |
vector_store=self.vector_store,
|
|
|
1 |
+
import glob
|
2 |
import gradio as gr
|
3 |
import openai
|
4 |
import os
|
|
|
15 |
from chat_template import CHAT_TEXT_QA_PROMPT
|
16 |
from schemas import ChatbotVersion, ServiceProvider
|
17 |
from chatbot import Chatbot, IndexBuilder
|
18 |
+
from custom_io import MarkdownReader, UnstructuredReader, default_file_metadata_func
|
19 |
from qdrant import client as qdrantClient
|
20 |
from llama_index import set_global_service_context
|
21 |
|
|
|
29 |
openai.api_key = os.getenv("OPENAI_API_KEY")
|
30 |
|
31 |
IS_LOAD_FROM_VECTOR_STORE = True
|
32 |
+
VDB_COLLECTION_NAME = "demo-v1"
|
33 |
MODEL_NAME = ChatbotVersion.CHATGPT_4.value
|
34 |
|
35 |
|
36 |
+
CHUNK_SIZE = 8191
|
37 |
LLM, EMBED_MODEL = get_service_provider_config(
|
38 |
service_provider=ServiceProvider.OPENAI, model_name=MODEL_NAME)
|
39 |
service_context = ServiceContext.from_defaults(
|
|
|
46 |
|
47 |
class AwesumIndexBuilder(IndexBuilder):
|
48 |
def _load_doucments(self):
|
49 |
+
directory = "./awesumcare_data/awesumcare_manual_data"
|
50 |
+
# all_files = glob.glob(os.path.join(directory, '*.md'))
|
51 |
+
# faq_files = [f for f in all_files if 'FAQ' in os.path.basename(f)]
|
52 |
+
# print(faq_files)
|
53 |
+
dir_reader = SimpleDirectoryReader(directory, file_extractor={
|
54 |
".pdf": UnstructuredReader(),
|
55 |
".docx": UnstructuredReader(),
|
56 |
".pptx": UnstructuredReader(),
|
57 |
+
".md": MarkdownReader()
|
58 |
},
|
59 |
recursive=True,
|
60 |
+
# input_files=faq_files,
|
61 |
+
exclude=["*.png", "*.pptx", "*.docx", "*.pdf"],
|
62 |
file_metadata=default_file_metadata_func)
|
63 |
|
64 |
self.documents = dir_reader.load_data()
|
|
|
80 |
return
|
81 |
pipeline = IngestionPipeline(
|
82 |
transformations=[
|
83 |
+
# SentenceSplitter(),
|
84 |
self.embed_model,
|
85 |
],
|
86 |
vector_store=self.vector_store,
|
chatbot.py
CHANGED
@@ -126,14 +126,14 @@ class Chatbot:
|
|
126 |
partial_message += token
|
127 |
yield partial_message
|
128 |
|
129 |
-
urls = [source.node.metadata.get(
|
130 |
-
|
131 |
-
if urls:
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
|
138 |
def convert_to_chat_messages(self, history: List[List[str]]) -> List[ChatMessage]:
|
139 |
chat_messages = [ChatMessage(
|
|
|
126 |
partial_message += token
|
127 |
yield partial_message
|
128 |
|
129 |
+
# urls = [source.node.metadata.get(
|
130 |
+
# "file_name") for source in response.source_nodes if source.score >= 0.78 and source.node.metadata.get("file_name")]
|
131 |
+
# if urls:
|
132 |
+
# urls = list(set(urls))
|
133 |
+
# url_section = "\n \n\n---\n\n參考: \n" + \
|
134 |
+
# "\n".join(f"- {url}" for url in urls)
|
135 |
+
# partial_message += url_section
|
136 |
+
# yield partial_message
|
137 |
|
138 |
def convert_to_chat_messages(self, history: List[List[str]]) -> List[ChatMessage]:
|
139 |
chat_messages = [ChatMessage(
|
custom_io.py
CHANGED
@@ -50,6 +50,51 @@ class UnstructuredReader(BaseReader):
|
|
50 |
]
|
51 |
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
def default_file_metadata_func(file_path: str) -> Dict:
|
54 |
"""Get some handy metadate from filesystem.
|
55 |
|
|
|
50 |
]
|
51 |
|
52 |
|
53 |
+
class MarkdownReader(BaseReader):
    """Reader that turns one Markdown file into knowledge-unit Documents.

    The file is split by ``parse_knowledge_units`` and each returned unit
    becomes the text of a ``Document``.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        split_documents: Optional[bool] = True,
    ) -> List[Document]:
        """Parse a Markdown file into Documents.

        Args:
            file: Path of the Markdown file to read.
            extra_info: Metadata attached to every produced Document; an
                empty dict is used when omitted.
            split_documents: When truthy, emit one Document per knowledge
                unit. When falsy, emit a single Document containing all
                units joined by blank lines.

        Returns:
            A list of Documents (never ``None``).
        """
        # NOTE: the previous version imported ``unstructured.partition.auto``
        # here without using it; the import was dropped so this reader no
        # longer depends on that package being installed.
        units = parse_knowledge_units(str(file))

        if split_documents:
            return [
                Document(text=unit, extra_info=extra_info or {})
                for unit in units
            ]

        # Previously this path fell off the end of the function and returned
        # None, violating the declared return type. Return the whole file as
        # one Document instead.
        return [Document(text="\n\n".join(units), extra_info=extra_info or {})]
|
76 |
+
|
77 |
+
def parse_knowledge_units(file_path):
    """Split a UTF-8 text file into "knowledge units".

    A new unit starts at every non-blank line whose first character is a
    digit and which contains a ``.`` somewhere (a heuristic for numbered
    headings such as ``1. Title``). All following lines belong to that unit
    until the next such line. Any text before the first numbered line is
    returned as its own leading unit.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of unit strings with surrounding whitespace stripped. Units
        that are empty after stripping (e.g. leading blank lines, or a
        whitespace-only file) are omitted.
    """
    knowledge_units = []
    pending_lines = []

    def flush():
        # Strip *before* testing for emptiness so whitespace-only chunks do
        # not end up as empty units.
        text = "".join(pending_lines).strip()
        if text:
            knowledge_units.append(text)
        pending_lines.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            starts_new_unit = (
                bool(line.strip()) and line[0].isdigit() and '.' in line
            )
            if starts_new_unit:
                flush()
            pending_lines.append(line)

    flush()
    return knowledge_units
|
97 |
+
|
98 |
def default_file_metadata_func(file_path: str) -> Dict:
|
99 |
"""Get some handy metadate from filesystem.
|
100 |
|
scripts/convert_docx_to_md.sh
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
#
# Convert every .docx file directly inside a directory to Markdown using
# pandoc. Each <name>.docx produces <name>.md in the same directory, and
# embedded media is extracted into that directory as well.
#
# Usage: convert_docx_to_md.sh <directory_path>
# Exit status: 0 when every conversion succeeded, 1 on bad usage, missing
# directory, missing pandoc, or if any file failed to convert.

# Check if a directory path is provided
if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <directory_path>"
    exit 1
fi

# Get the directory path from the argument
dir_path=$1

# Check if the specified directory exists
if [ ! -d "$dir_path" ]; then
    echo "Directory does not exist: $dir_path"
    exit 1
fi

# Fail early with a clear message instead of one error per file
if ! command -v pandoc >/dev/null 2>&1; then
    echo "pandoc is not installed or not on PATH" >&2
    exit 1
fi

failed=0

# Iterate through all .docx files in the specified directory
for docx_file in "$dir_path"/*.docx; do
    # Skip if no .docx files are found (the unexpanded glob is not a file)
    if [ ! -f "$docx_file" ]; then
        continue
    fi

    # Extract filename without extension
    filename=$(basename -- "$docx_file")
    filename="${filename%.*}"

    # Define the output Markdown filename
    md_file="${dir_path}/${filename}.md"

    # Convert the document to Markdown format; only report success when
    # pandoc actually succeeded, and keep going so one bad file does not
    # block the rest.
    if pandoc -t markdown --extract-media="$dir_path" "$docx_file" -o "$md_file"; then
        echo "Converted: $docx_file to $md_file"
    else
        echo "Failed to convert: $docx_file" >&2
        failed=1
    fi
done

if [ "$failed" -ne 0 ]; then
    echo "Conversion finished with errors." >&2
    exit 1
fi

echo "Conversion complete."
|