titanhacker committed on
Commit
091596c
1 Parent(s): dfbe8af

Upload 35 files

Files changed (36)
  1. .gitattributes +5 -0
  2. configs/app_config.yml +46 -0
  3. data/docs/Internal Medicine, Getachew Tizazu, Tadesse Anteneh.pdf +3 -0
  4. data/docs/cancer_and_cure__a_critical_analysis.27.pdf +0 -0
  5. data/docs/medical_oncology_handbook_june_2020_edition.pdf +0 -0
  6. data/docs_2/Attention_Is_All_You_Need.pdf +3 -0
  7. data/docs_2/stories.pdf +0 -0
  8. data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/data_level0.bin +3 -0
  9. data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/header.bin +3 -0
  10. data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/index_metadata.pickle +3 -0
  11. data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/length.bin +3 -0
  12. data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/link_lists.bin +3 -0
  13. data/vectordb/processed/chroma/chroma.sqlite3 +3 -0
  14. data/vectordb1/processed/chroma/chroma.sqlite3 +3 -0
  15. data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/data_level0.bin +3 -0
  16. data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/header.bin +3 -0
  17. data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/index_metadata.pickle +3 -0
  18. data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/length.bin +3 -0
  19. data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/link_lists.bin +3 -0
  20. images/Gemma-logo.png +0 -0
  21. images/test.png +3 -0
  22. src/app.py +116 -0
  23. src/llm_serve.py +50 -0
  24. src/llm_service.py +58 -0
  25. src/reference_serve.py +76 -0
  26. src/upload_data_manually.py +35 -0
  27. src/utils/__pycache__/chatbot.cpython-39.pyc +0 -0
  28. src/utils/__pycache__/load_config.cpython-39.pyc +0 -0
  29. src/utils/__pycache__/prepare_vectordb.cpython-39.pyc +0 -0
  30. src/utils/__pycache__/ui_settings.cpython-39.pyc +0 -0
  31. src/utils/__pycache__/upload_file.cpython-39.pyc +0 -0
  32. src/utils/chatbot.py +171 -0
  33. src/utils/load_config.py +89 -0
  34. src/utils/prepare_vectordb.py +117 -0
  35. src/utils/ui_settings.py +35 -0
  36. src/utils/upload_file.py +39 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/docs_2/Attention_Is_All_You_Need.pdf filter=lfs diff=lfs merge=lfs -text
+ data/docs/Internal[[:space:]]Medicine,[[:space:]]Getachew[[:space:]]Tizazu,[[:space:]]Tadesse[[:space:]]Anteneh.pdf filter=lfs diff=lfs merge=lfs -text
+ data/vectordb/processed/chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ data/vectordb1/processed/chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ images/test.png filter=lfs diff=lfs merge=lfs -text
configs/app_config.yml ADDED
@@ -0,0 +1,46 @@
+ directories:
+   data_directory: data/docs
+   data_directory_2: data/docs_2
+   persist_directory: data/vectordb/processed/chroma/
+   custom_persist_directory: data/vectordb/uploaded/chroma/
+
+ llm_config:
+   embedding_model: "NeuML/pubmedbert-base-embeddings"
+   llm_system_role_with_history: "Answer the question based on the given content without using your own knowledge.
+     You will receive a prompt with the following format:
+
+     # Chat history:\n
+     [user query, response]\n\n
+
+     # Retrieved content number:\n
+     Content\n\n
+     Source\n\n
+
+     # User question:\n
+     New question
+     "
+   llm_system_role_without_history: "In the following you receive a prompt.
+     Answer it based on the given content. Provide only the response; don't say 'Answer:'."
+   engine: "BioMistral/BioMistral-7B"
+   temperature: 0.1
+   device: "cuda"
+   max_new_tokens: 4096
+   do_sample: True
+   top_k: 10
+   top_p: 0.1
+   add_history: False
+
+ splitter_config:
+   chunk_size: 1500
+   chunk_overlap: 250
+
+ retrieval_config:
+   k: 2
+
+ serve:
+   port: 8000
+
+ memory:
+   number_of_q_a_pairs: 2
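For a quick sanity check, the config can be loaded the same way the rest of the repository loads it (yaml plus pyprojroot; see src/reference_serve.py and src/utils/load_config.py below). A minimal sketch:

import yaml
from pyprojroot import here

# Minimal sketch: load the config and print a couple of values.
with open(here("configs/app_config.yml")) as cfg:
    app_config = yaml.load(cfg, Loader=yaml.FullLoader)

print(app_config["llm_config"]["engine"])   # BioMistral/BioMistral-7B
print(app_config["retrieval_config"]["k"])  # 2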
data/docs/Internal Medicine, Getachew Tizazu, Tadesse Anteneh.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0169c575e3a4a5e39326a18ad57f1c0f82d9e78d33d06fbb895d395394d75b0a
+ size 2035855
data/docs/cancer_and_cure__a_critical_analysis.27.pdf ADDED
Binary file (226 kB).
 
data/docs/medical_oncology_handbook_june_2020_edition.pdf ADDED
Binary file (818 kB).
 
data/docs_2/Attention_Is_All_You_Need.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7d72988fd8107d07f7d278bf0ba6621adb6ed47df74be4014fa4a01f03aff6a
+ size 2215244
data/docs_2/stories.pdf ADDED
Binary file (427 kB).
 
data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d17e4a33d5f6f07d497f04499ed371b0b9688fb5fd10be3295b8cd02c9bcd4e
+ size 3212000
data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b27fdd8437a04493bf8a7d1f9a2e0fe99a426f9dd1cd05fc39e36645b2f4892
+ size 100
data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aaabaf8bd5deb2ce724216d3d22742448e6bb36ad6fd47a8b7546943ac7bc2d9
+ size 55974
data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd0e081293048e3304f94190eb77ea8a43b86ea1d325125dde693add12c583c0
+ size 4000
data/vectordb/processed/chroma/6987cbb9-f35b-4396-b0c0-a911c7c462df/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d369861227544a7d964868dd4119f8d689bf3bad375dfbde0801806d57931359
+ size 8624
data/vectordb/processed/chroma/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b2e23cc1f35dd063d2cc3d566647382f22b7f130a53c30a82f65e7d57d22658
+ size 13520896
data/vectordb1/processed/chroma/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5159675380044145b50f2c946dcb6015b30199c9475fc4d3763410a052770fca
+ size 14123008
data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d340a83df102dfb094b470e14917af51d9cc9b4d2899aac7e8d03f5281dae4af
+ size 4236000
data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a4f6ac52066e3cc07e4dbd3cf3ba996e47b2dc0f3f3b483d9667139b648464d
+ size 100
data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6791402d0c0c55745efcc92b7b2b686b854ca13405ce507c8019494d5be482cb
+ size 55974
data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4570a3076ed944f19171251810fe12aa927bf0befa0961c875f5838be34af681
+ size 4000
data/vectordb1/processed/chroma/d65c6700-15e7-4d1b-8a9c-16cefb1f4e1e/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:15f5b49c9cc9564ead2b3490bfd89131af9469aec01ba17df0234f56400380aa
+ size 8624
images/Gemma-logo.png ADDED
images/test.png ADDED

Git LFS Details

  • SHA256: 774ba274afa2d6a67b1e071562a485fe2f48ba306141f78df28af357b7199c08
  • Pointer size: 132 Bytes
  • Size of remote file: 4.97 MB
src/app.py ADDED
@@ -0,0 +1,116 @@
+ import gradio as gr
+ from utils.upload_file import UploadFile
+ from utils.chatbot import ChatBot
+ from utils.ui_settings import UISettings
+ from utils.load_config import LoadConfig
+
+ APPCFG = LoadConfig()
+ # # Prepare the LLM and tokenizer
+ # tokenizer = AutoTokenizer.from_pretrained(
+ #     APPCFG.llm_engine, token=APPCFG.gemma_token, device=APPCFG.device)
+ # model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="google/gemma-7b-it",
+ #                                              token=APPCFG.gemma_token,
+ #                                              torch_dtype=torch.float16,
+ #                                              device_map=APPCFG.device)
+ # app_pipeline = pipeline(
+ #     "text-generation",
+ #     model=model,
+ #     tokenizer=tokenizer
+ # )
+ with gr.Blocks() as demo:
+     with gr.Tabs():
+         with gr.TabItem("Med-App"):
+             ##############
+             # First ROW:
+             ##############
+             with gr.Row() as row_one:
+                 with gr.Column(visible=False) as reference_bar:
+                     ref_output = gr.Markdown()
+
+                 with gr.Column() as chatbot_output:
+                     chatbot = gr.Chatbot(
+                         [],
+                         elem_id="chatbot",
+                         bubble_full_width=False,
+                         height=500,
+                         avatar_images=("images/test.png", "images/Gemma-logo.png"),
+                         # render=False
+                     )
+                     # Add like/dislike icons to the chatbot messages
+                     chatbot.like(UISettings.feedback, None, None)
+             ##############
+             # Second ROW:
+             ##############
+             with gr.Row():
+                 input_txt = gr.Textbox(
+                     lines=4,
+                     scale=8,
+                     placeholder="Enter text and press enter, or upload PDF files",
+                     container=False,
+                 )
+
+             ##############
+             # Third ROW:
+             ##############
+             with gr.Row() as row_two:
+                 text_submit_btn = gr.Button(value="Submit text")
+                 sidebar_state = gr.State(False)
+                 btn_toggle_sidebar = gr.Button(value="References")
+                 btn_toggle_sidebar.click(UISettings.toggle_sidebar, [sidebar_state],
+                                          [reference_bar, sidebar_state])
+                 upload_btn = gr.UploadButton(
+                     "📁 Upload PDF or doc files",
+                     file_types=['.pdf', '.doc'],
+                     file_count="multiple")
+                 clear_button = gr.ClearButton([input_txt, chatbot])
+                 rag_with_dropdown = gr.Dropdown(
+                     label="RAG with",
+                     choices=["Preprocessed doc", "Upload doc: Process for RAG"],
+                     value="Preprocessed doc")
+             ##############
+             # Fourth ROW:
+             ##############
+             with gr.Row() as row_four:
+                 temperature_bar = gr.Slider(
+                     minimum=0.1, maximum=1, value=0.1, step=0.1,
+                     label="Temperature",
+                     info="Increasing the temperature will make the model answer more creatively.")
+                 top_k = gr.Slider(
+                     minimum=0.0, maximum=100.0, step=1, value=50,
+                     label="top_k",
+                     info="A lower value (e.g. 10) will result in more conservative answers.")
+                 top_p = gr.Slider(
+                     minimum=0.0, maximum=1.0, step=0.01, value=0.95,
+                     label="top_p",
+                     info="Works together with top_k. A lower value (e.g. 0.5) will generate more focused and conservative text.")
+
+             ##############
+             # Process:
+             ##############
+             file_msg = upload_btn.upload(
+                 fn=UploadFile.process_uploaded_files,
+                 inputs=[upload_btn, chatbot, rag_with_dropdown],
+                 outputs=[input_txt, chatbot],
+                 queue=False)
+
+             txt_msg = input_txt.submit(
+                 fn=ChatBot.respond,
+                 inputs=[chatbot, input_txt, rag_with_dropdown, temperature_bar, top_k, top_p],
+                 outputs=[input_txt, chatbot, ref_output],
+                 queue=False).then(lambda: gr.Textbox(interactive=True),
+                                   None, [input_txt], queue=False)
+
+             txt_msg = text_submit_btn.click(
+                 fn=ChatBot.respond,
+                 inputs=[chatbot, input_txt, rag_with_dropdown, temperature_bar, top_k, top_p],
+                 outputs=[input_txt, chatbot, ref_output],
+                 queue=False).then(lambda: gr.Textbox(interactive=True),
+                                   None, [input_txt], queue=False)
+
+
+ if __name__ == "__main__":
+     demo.launch()
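Note that src/app.py only builds the Gradio UI. For the app to answer questions end to end, the Flask generation server (src/llm_serve.py or src/llm_service.py, listening on port 8888) must already be running, and the reference links rendered by src/utils/chatbot.py assume the document server (src/reference_serve.py, port 8000 per the config) is up as well.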
src/llm_serve.py ADDED
@@ -0,0 +1,50 @@
+ import torch
+ from flask import Flask, request, jsonify
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from utils.load_config import LoadConfig
+
+ APPCFG = LoadConfig()
+
+ app = Flask(__name__)
+
+ # Load the LLM and tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(
+     APPCFG.llm_engine, token=APPCFG.gemma_token, device=APPCFG.device)
+ model = AutoModelForCausalLM.from_pretrained(
+     pretrained_model_name_or_path="BioMistral/BioMistral-7B",
+     token=APPCFG.gemma_token,
+     torch_dtype=torch.float16,
+     device_map=APPCFG.device
+ )
+ app_pipeline = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer
+ )
+
+
+ @app.route("/generate_text", methods=["POST"])
+ def generate_text():
+     data = request.json
+     prompt = data.get("prompt", "")
+     max_new_tokens = data.get("max_new_tokens", 1000)
+     do_sample = data.get("do_sample", True)
+     temperature = data.get("temperature", 0.1)
+     top_k = data.get("top_k", 50)
+     top_p = data.get("top_p", 0.95)
+
+     # Apply the model's chat template to the incoming message list
+     tokenized_prompt = app_pipeline.tokenizer.apply_chat_template(
+         prompt, tokenize=False, add_generation_prompt=True)
+     outputs = app_pipeline(
+         tokenized_prompt,
+         max_new_tokens=max_new_tokens,
+         do_sample=do_sample,
+         temperature=temperature,
+         top_k=top_k,
+         top_p=top_p
+     )
+
+     # Strip the echoed prompt and return only the newly generated text
+     return jsonify({"response": outputs[0]["generated_text"][len(tokenized_prompt):]})
+
+
+ if __name__ == "__main__":
+     app.run(debug=False, port=8888)
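For reference, a client call to this endpoint might look like the sketch below. The payload mirrors the one built in src/utils/chatbot.py: "prompt" is a chat-style message list and the remaining keys are sampling parameters; the question text is illustrative.

import requests

# Illustrative request against the local generation server started above.
payload = {
    "prompt": [{"role": "user", "content": "What are the side effects of cisplatin?"}],
    "max_new_tokens": 512,
    "do_sample": True,
    "temperature": 0.1,
    "top_k": 10,
    "top_p": 0.1,
}
response = requests.post("http://127.0.0.1:8888/generate_text", json=payload)
print(response.json()["response"])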
src/llm_service.py ADDED
@@ -0,0 +1,58 @@
+ import torch
+ from flask import Flask, request, jsonify
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from utils.load_config import LoadConfig
+
+ APPCFG = LoadConfig()
+
+ app = Flask(__name__)
+
+ # Load the LLM and Tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(
+     APPCFG.llm_engine, token=APPCFG.gemma_token, device=APPCFG.device)
+ model = AutoModelForCausalLM.from_pretrained(
+     pretrained_model_name_or_path="BioMistral/BioMistral-7B",
+     token=APPCFG.gemma_token,
+     torch_dtype=torch.float16,
+     device_map=APPCFG.device
+ )
+ app_pipeline = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer
+ )
+
+
+ # Endpoint to generate text
+ @app.route("/generate_text", methods=["POST"])
+ def generate_text():
+     data = request.json
+     prompt = data.get("prompt", "")
+     max_new_tokens = data.get("max_new_tokens", 1000)
+     do_sample = data.get("do_sample", True)
+     temperature = data.get("temperature", 0.1)
+     top_k = data.get("top_k", 50)
+     top_p = data.get("top_p", 0.95)
+     # Apply the model's chat template to the incoming message list
+     tokenized_prompt = app_pipeline.tokenizer.apply_chat_template(
+         prompt, tokenize=False, add_generation_prompt=True)
+
+     # Generate text based on the prompt
+     response = app_pipeline(
+         tokenized_prompt,
+         max_new_tokens=max_new_tokens,
+         do_sample=do_sample,
+         temperature=temperature,
+         top_k=top_k,
+         top_p=top_p
+     )
+     # Debug output: sampling parameters and the generated completion
+     print("==================")
+     print("top_k:", top_k, "top_p:", top_p, "temperature:",
+           temperature, "max_new_tokens:", max_new_tokens)
+     print("==================")
+     print(response[0]["generated_text"][len(tokenized_prompt):])
+     print("==================")
+
+     # Strip the echoed prompt and return only the newly generated text
+     return jsonify({"response": response[0]["generated_text"][len(tokenized_prompt):]})
+
+
+ if __name__ == "__main__":
+     app.run(debug=False, port=8888)
src/reference_serve.py ADDED
@@ -0,0 +1,76 @@
+ import http.server
+ import socketserver
+ import yaml
+ import os
+ from pyprojroot import here
+
+ with open(here("configs/app_config.yml")) as cfg:
+     app_config = yaml.load(cfg, Loader=yaml.FullLoader)
+
+ PORT = app_config["serve"]["port"]
+ DIRECTORY1 = app_config["directories"]["data_directory"]
+ DIRECTORY2 = app_config["directories"]["data_directory_2"]
+
+
+ class SingleDirectoryHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
+     """
+     Custom HTTP request handler that serves files from a single directory.
+
+     This class extends SimpleHTTPRequestHandler and sets the serving directory to DIRECTORY1.
+     """
+
+     def __init__(self, *args, **kwargs):
+         """
+         Initialize the SingleDirectoryHTTPRequestHandler.
+
+         Parameters:
+             args: Additional positional arguments for the base class.
+             kwargs: Additional keyword arguments for the base class.
+         """
+         super().__init__(*args, directory=DIRECTORY1, **kwargs)
+
+
+ class MultiDirectoryHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
+     """
+     Custom HTTP request handler that serves files from multiple directories.
+
+     This class extends SimpleHTTPRequestHandler and allows serving files from DIRECTORY1 and DIRECTORY2
+     based on the first directory component in the requested path.
+     """
+
+     def translate_path(self, path):
+         """
+         Translate the requested path to the actual file path.
+
+         Parameters:
+             path (str): The requested path.
+
+         Returns:
+             str: The translated file path.
+         """
+         # Split the path to get the first directory component
+         parts = path.split('/', 2)
+         if len(parts) > 1:
+             first_directory = parts[1]
+             # Check if the first directory matches either target directory
+             if first_directory == os.path.basename(DIRECTORY1):
+                 path = os.path.join(DIRECTORY1, *parts[2:])
+             elif first_directory == os.path.basename(DIRECTORY2):
+                 path = os.path.join(DIRECTORY2, *parts[2:])
+             else:
+                 # If the first part of the path is not a directory, check both directories for the file
+                 file_path1 = os.path.join(DIRECTORY1, first_directory)
+                 file_path2 = os.path.join(DIRECTORY2, first_directory)
+                 if os.path.isfile(file_path1):
+                     return file_path1
+                 elif os.path.isfile(file_path2):
+                     return file_path2
+         # If there's no match, use the default directory
+         return super().translate_path(path)
+
+
+ if __name__ == "__main__":
+     with socketserver.TCPServer(("", PORT), MultiDirectoryHTTPRequestHandler) as httpd:
+         print(f"Serving at port {PORT}")
+         httpd.serve_forever()
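As a usage sketch: src/utils/chatbot.py builds reference links of the form http://localhost:8000/<basename of the source PDF>, which this server resolves against either docs directory. Assuming the server is running on the configured port 8000:

import requests

# 'stories.pdf' exists under data/docs_2 in this commit; any basename from
# data/docs or data/docs_2 would resolve the same way.
url = "http://localhost:8000/stories.pdf"
pdf_bytes = requests.get(url).content
print(len(pdf_bytes), "bytes downloaded")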
src/upload_data_manually.py ADDED
@@ -0,0 +1,35 @@
+ import os
+ from utils.prepare_vectordb import PrepareVectorDB
+ from utils.load_config import LoadConfig
+
+ CONFIG = LoadConfig()
+
+
+ def upload_data_manually() -> None:
+     """
+     Upload data manually to the VectorDB.
+
+     This function initializes a PrepareVectorDB instance with the configured data
+     directory, persist directory, chunk_size, and chunk_overlap. It then checks
+     whether the VectorDB already exists in the specified persist_directory. If not,
+     it calls the prepare_and_save_vectordb method to create and save the VectorDB;
+     otherwise it prints a message indicating that the VectorDB is already present.
+
+     Returns:
+         None
+     """
+     prepare_vectordb_instance = PrepareVectorDB(
+         data_directory=CONFIG.data_directory,
+         persist_directory=CONFIG.persist_directory,
+         chunk_size=CONFIG.chunk_size,
+         chunk_overlap=CONFIG.chunk_overlap,
+     )
+     # Only build the VectorDB if the persist directory is empty
+     if len(os.listdir(CONFIG.persist_directory)) == 0:
+         prepare_vectordb_instance.prepare_and_save_vectordb()
+     else:
+         print(f"VectorDB already exists in {CONFIG.persist_directory}")
+     return None
+
+
+ if __name__ == "__main__":
+     upload_data_manually()
src/utils/__pycache__/chatbot.cpython-39.pyc ADDED
Binary file (5.05 kB).
 
src/utils/__pycache__/load_config.cpython-39.pyc ADDED
Binary file (3.17 kB).
 
src/utils/__pycache__/prepare_vectordb.cpython-39.pyc ADDED
Binary file (4.12 kB).
 
src/utils/__pycache__/ui_settings.cpython-39.pyc ADDED
Binary file (1.4 kB).
 
src/utils/__pycache__/upload_file.cpython-39.pyc ADDED
Binary file (1.75 kB).
 
src/utils/chatbot.py ADDED
@@ -0,0 +1,171 @@
+ import gradio as gr
+ import time
+ import os
+ from langchain.vectorstores import Chroma
+ from typing import List, Tuple
+ import re
+ import ast
+ import html
+ from utils.load_config import LoadConfig
+ from langchain.embeddings import HuggingFaceEmbeddings
+ import requests
+ import torch
+
+ FLASK_APP_ENDPOINT = "http://127.0.0.1:8888/generate_text"
+
+ APPCFG = LoadConfig()
+ URL = ""
+ hyperlink = f"[RAG]({URL})"  # placeholder link used in the VectorDB-missing message below
+
+
+ class ChatBot:
+     """
+     Class representing a chatbot with document retrieval and response generation capabilities.
+
+     This class provides static methods for responding to user queries, handling feedback, and
+     cleaning references from retrieved documents.
+     """
+     @staticmethod
+     def respond(chatbot: List,
+                 message: str,
+                 data_type: str = "Preprocessed doc",
+                 temperature: float = 0.1,
+                 top_k: int = 10,
+                 top_p: float = 0.1) -> Tuple:
+         """
+         Generate a response to a user query using document retrieval and language model completion.
+
+         Parameters:
+             chatbot (List): List representing the chatbot's conversation history.
+             message (str): The user's query.
+             data_type (str): Type of data used for document retrieval ("Preprocessed doc" or "Upload doc: Process for RAG").
+             temperature (float): Temperature parameter for language model completion.
+             top_k (int): top-k sampling parameter for language model completion.
+             top_p (float): top-p (nucleus) sampling parameter for language model completion.
+
+         Returns:
+             Tuple: A tuple containing an empty string, the updated chat history, and references from retrieved documents.
+         """
+         # Retrieve the embedding function used to build the VectorDB
+         # emb_model = "sentence-transformers/all-MiniLM-L6-v2"
+         embedding_function = HuggingFaceEmbeddings(
+             model_name="NeuML/pubmedbert-base-embeddings",
+             # cache_folder=os.getenv('SENTENCE_TRANSFORMERS_HOME')
+         )
+         if data_type == "Preprocessed doc":
+             if os.path.exists(APPCFG.persist_directory):
+                 vectordb = Chroma(persist_directory=APPCFG.persist_directory,
+                                   embedding_function=embedding_function)
+             else:
+                 chatbot.append(
+                     (message, f"VectorDB does not exist. Please first execute the 'upload_data_manually.py' module. For further information please visit {hyperlink}."))
+                 return "", chatbot, None
+
+         elif data_type == "Upload doc: Process for RAG":
+             if os.path.exists(APPCFG.custom_persist_directory):
+                 vectordb = Chroma(persist_directory=APPCFG.custom_persist_directory,
+                                   embedding_function=embedding_function)
+             else:
+                 chatbot.append(
+                     (message, "No file was uploaded. Please first upload your files using the 'upload' button."))
+                 return "", chatbot, None
+
+         docs = vectordb.similarity_search(message, k=APPCFG.k)
+         question = "# Prompt that you have to answer:\n" + message
+         retrieved_content, markdown_documents = ChatBot.clean_references(docs)
+         # Memory: include the previous N Q&A pairs (N from the config) as chat history
+         chat_history = f"Chat history:\n {str(chatbot[-APPCFG.number_of_q_a_pairs:])}\n\n"
+         if APPCFG.add_history:
+             prompt_wrapper = f"{APPCFG.llm_system_role_with_history}\n\n{chat_history}\n\n{retrieved_content}{question}"
+         else:
+             prompt_wrapper = f"{APPCFG.llm_system_role_without_history}\n\n{question}\n\n{retrieved_content}"
+
+         print("========================")
+         print(prompt_wrapper)
+         print("========================")
+         messages = [
+             {"role": "user", "content": prompt_wrapper},
+         ]
+         data = {
+             "prompt": messages,
+             "max_new_tokens": APPCFG.max_new_tokens,
+             "do_sample": APPCFG.do_sample,
+             "temperature": temperature,
+             "top_k": top_k,
+             "top_p": top_p
+         }
+         # Delegate generation to the Flask server (src/llm_serve.py / src/llm_service.py)
+         response = requests.post(FLASK_APP_ENDPOINT, json=data)
+         response_json = response.json()
+
+         chatbot.append(
+             (message, response_json["response"]))
+         # Clean up GPU memory
+         del vectordb
+         del docs
+         torch.cuda.empty_cache()
+         return "", chatbot, markdown_documents
+
+     @staticmethod
+     def clean_references(documents: List) -> Tuple[str, str]:
+         """
+         Clean and format references from retrieved documents.
+
+         Parameters:
+             documents (List): List of retrieved documents.
+
+         Returns:
+             Tuple[str, str]: The cleaned retrieved content for the prompt, and a
+             markdown string with per-document source references.
+         """
+         server_url = "http://localhost:8000"
+         documents = [str(x) + "\n\n" for x in documents]
+         markdown_documents = ""
+         retrieved_content = ""
+         counter = 1
+         for doc in documents:
+             # Extract content and metadata from the document's string representation
+             content, metadata = re.match(
+                 r"page_content=(.*?)( metadata=\{.*\})", doc).groups()
+             metadata = metadata.split('=', 1)[1]
+             metadata_dict = ast.literal_eval(metadata)
+
+             # Decode newlines and other escape sequences
+             content = bytes(content, "utf-8").decode("unicode_escape")
+
+             # Replace escaped newlines with actual newlines and collapse whitespace
+             content = re.sub(r'\\n', '\n', content)
+             content = re.sub(r'\s*<EOS>\s*<pad>\s*', ' ', content)
+             content = re.sub(r'\s+', ' ', content).strip()
+
+             # Decode HTML entities
+             content = html.unescape(content)
+
+             # Re-encode/decode with UTF-8 (ignoring errors) to drop invalid characters
+             content = content.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
+
+             # Remove or replace special characters and mathematical symbols.
+             # This step may need to be customized based on the specific symbols in your documents.
+             content = re.sub(r'–', '-', content)
+             content = re.sub(r'∈', '∈', content)
+             content = re.sub(r'×', '×', content)
+             content = re.sub(r'ﬁ', 'fi', content)  # normalize the PDF 'fi' ligature
+             content = re.sub(r'·', '·', content)
+             content = re.sub(r'ﬂ', 'fl', content)  # normalize the PDF 'fl' ligature
+
+             pdf_url = f"{server_url}/{os.path.basename(metadata_dict['source'])}"
+             retrieved_content += f"# Content {counter}:\n" + content + "\n\n"
+
+             # Append cleaned content to the markdown string with source, page, and link
+             markdown_documents += f"# Retrieved content {counter}:\n" + content + "\n\n" + \
+                 f"Source: {os.path.basename(metadata_dict['source'])}" + " | " + \
+                 f"Page number: {str(metadata_dict['page'])}" + " | " + \
+                 f"[View PDF]({pdf_url})" + "\n\n"
+             counter += 1
+
+         return retrieved_content, markdown_documents
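A note on clean_references: regex-parsing str(doc) is fragile across LangChain versions. A simpler sketch (assuming the standard LangChain Document interface with page_content and metadata attributes) reads the fields directly:

def clean_references_direct(documents):
    # Sketch: access Document fields directly instead of regex-parsing str(doc).
    retrieved_content = ""
    for i, doc in enumerate(documents, start=1):
        content = " ".join(doc.page_content.split())  # collapse whitespace
        source = doc.metadata.get("source", "unknown")
        page = doc.metadata.get("page", "?")
        retrieved_content += f"# Content {i}:\n{content}\n\nSource: {source} | Page: {page}\n\n"
    return retrieved_content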
src/utils/load_config.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ from dotenv import load_dotenv
+ import yaml
+ from pyprojroot import here
+ import shutil
+
+ load_dotenv()
+
+
+ class LoadConfig:
+     """
+     A class for loading configuration settings and managing directories.
+
+     This class loads various configuration settings from the 'app_config.yml' file,
+     including language model (LLM) configurations, retrieval configurations, splitter
+     configurations, and memory configurations. It also reads the GEMMA_TOKEN from the
+     environment and performs directory-related operations such as creating and
+     removing directories.
+     """
+
+     def __init__(self) -> None:
+         with open(here("configs/app_config.yml")) as cfg:
+             app_config = yaml.load(cfg, Loader=yaml.FullLoader)
+
+         # LLM configs
+         self.llm_engine = app_config["llm_config"]["engine"]
+         self.llm_system_role_with_history = app_config["llm_config"]["llm_system_role_with_history"]
+         self.llm_system_role_without_history = app_config[
+             "llm_config"]["llm_system_role_without_history"]
+         # Needs to be a string for concatenation in the chromadb backend:
+         # self._settings.require("persist_directory") + "/chroma.sqlite3"
+         self.persist_directory = str(here(
+             app_config["directories"]["persist_directory"]))
+         self.custom_persist_directory = str(here(
+             app_config["directories"]["custom_persist_directory"]))
+         self.gemma_token = os.getenv("GEMMA_TOKEN")
+         self.device = app_config["llm_config"]["device"]
+         # Retrieval configs
+         self.data_directory = app_config["directories"]["data_directory"]
+         self.k = app_config["retrieval_config"]["k"]
+         self.chunk_size = int(app_config["splitter_config"]["chunk_size"])
+         self.chunk_overlap = int(
+             app_config["splitter_config"]["chunk_overlap"])
+         self.temperature = float(app_config["llm_config"]["temperature"])
+         self.add_history = bool(app_config["llm_config"]["add_history"])
+         self.top_k = int(app_config["llm_config"]["top_k"])
+         self.top_p = float(app_config["llm_config"]["top_p"])
+         self.max_new_tokens = int(app_config["llm_config"]["max_new_tokens"])
+         self.do_sample = bool(app_config["llm_config"]["do_sample"])
+         self.embedding_model = app_config["llm_config"]["embedding_model"]
+
+         # Memory
+         self.number_of_q_a_pairs = int(
+             app_config["memory"]["number_of_q_a_pairs"])
+
+         # Clean up the uploaded-doc VectorDB if it exists
+         self.create_directory(self.persist_directory)
+         self.remove_directory(self.custom_persist_directory)
+
+     def create_directory(self, directory_path: str):
+         """
+         Create a directory if it does not exist.
+
+         Parameters:
+             directory_path (str): The path of the directory to be created.
+         """
+         if not os.path.exists(directory_path):
+             os.makedirs(directory_path)
+
+     def remove_directory(self, directory_path: str):
+         """
+         Remove the specified directory.
+
+         Parameters:
+             directory_path (str): The path of the directory to be removed.
+
+         Raises:
+             OSError: If an error occurs during the directory removal process.
+
+         Returns:
+             None
+         """
+         if os.path.exists(directory_path):
+             try:
+                 shutil.rmtree(directory_path)
+                 print(
+                     f"The directory '{directory_path}' has been successfully removed.")
+             except OSError as e:
+                 print(f"Error: {e}")
+         else:
+             print(f"The directory '{directory_path}' does not exist.")
src/utils/prepare_vectordb.py ADDED
@@ -0,0 +1,117 @@
+ from langchain.vectorstores import Chroma
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import os
+ from typing import List
+ from langchain.embeddings import HuggingFaceEmbeddings
+
+
+ class PrepareVectorDB:
+     """
+     A class for preparing and saving a VectorDB using HuggingFace embeddings.
+
+     This class facilitates the process of loading documents, chunking them, and creating a VectorDB
+     with HuggingFace embeddings. It provides methods to prepare and save the VectorDB.
+
+     Parameters:
+         data_directory (str or List[str]): The directory or list of file paths containing the documents.
+         persist_directory (str): The directory to save the VectorDB.
+         chunk_size (int): The size of the chunks for document processing.
+         chunk_overlap (int): The overlap between chunks.
+     """
+
+     def __init__(
+             self,
+             data_directory: str,
+             persist_directory: str,
+             chunk_size: int,
+             chunk_overlap: int
+     ) -> None:
+         """
+         Initialize the PrepareVectorDB instance.
+
+         Parameters:
+             data_directory (str or List[str]): The directory or list of file paths containing the documents.
+             persist_directory (str): The directory to save the VectorDB.
+             chunk_size (int): The size of the chunks for document processing.
+             chunk_overlap (int): The overlap between chunks.
+         """
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             separators=["\n\n", "\n", " ", ""]
+         )
+         """Other options: CharacterTextSplitter, TokenTextSplitter, etc."""
+         self.data_directory = data_directory
+         self.persist_directory = persist_directory
+         self.embedding_function = HuggingFaceEmbeddings(
+             model_name="NeuML/pubmedbert-base-embeddings",
+             # cache_folder=os.getenv('SENTENCE_TRANSFORMERS_HOME')
+         )
+
+     def __load_all_documents(self) -> List:
+         """
+         Load all documents from the specified directory or list of file paths.
+
+         Returns:
+             List: A list of loaded documents.
+         """
+         doc_counter = 0
+         if isinstance(self.data_directory, list):
+             print("Loading the uploaded documents...")
+             docs = []
+             for doc_dir in self.data_directory:
+                 docs.extend(PyPDFLoader(doc_dir).load())
+                 doc_counter += 1
+             print("Number of loaded documents:", doc_counter)
+             print("Number of pages:", len(docs), "\n\n")
+         else:
+             print("Loading documents manually...")
+             document_list = os.listdir(self.data_directory)
+             docs = []
+             for doc_name in document_list:
+                 docs.extend(PyPDFLoader(os.path.join(
+                     self.data_directory, doc_name)).load())
+                 doc_counter += 1
+             print("Number of loaded documents:", doc_counter)
+             print("Number of pages:", len(docs), "\n\n")
+
+         return docs
+
+     def __chunk_documents(self, docs: List) -> List:
+         """
+         Chunk the loaded documents using the specified text splitter.
+
+         Parameters:
+             docs (List): The list of loaded documents.
+
+         Returns:
+             List: A list of chunked documents.
+         """
+         print("Chunking documents...")
+         chunked_documents = self.text_splitter.split_documents(docs)
+         print("Number of chunks:", len(chunked_documents), "\n\n")
+         return chunked_documents
+
+     def prepare_and_save_vectordb(self):
+         """
+         Load, chunk, and create a VectorDB with HuggingFace embeddings, and save it.
+
+         Returns:
+             Chroma: The created VectorDB.
+         """
+         docs = self.__load_all_documents()
+         chunked_documents = self.__chunk_documents(docs)
+         print("Preparing vectordb...")
+         vectordb = Chroma.from_documents(
+             documents=chunked_documents,
+             embedding=self.embedding_function,
+             persist_directory=self.persist_directory
+         )
+         print("VectorDB is created and saved.")
+         print("Number of vectors in vectordb:",
+               vectordb._collection.count(), "\n\n")
+         return vectordb
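For uploaded files, src/utils/upload_file.py drives this same class with a list of file paths rather than a directory. A minimal sketch (the paths are illustrative; the chunking values match configs/app_config.yml):

from utils.prepare_vectordb import PrepareVectorDB

# Sketch: build a VectorDB from two specific PDFs (illustrative paths).
db_builder = PrepareVectorDB(
    data_directory=["data/docs_2/stories.pdf", "data/docs_2/Attention_Is_All_You_Need.pdf"],
    persist_directory="data/vectordb/uploaded/chroma/",
    chunk_size=1500,
    chunk_overlap=250,
)
vectordb = db_builder.prepare_and_save_vectordb()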
src/utils/ui_settings.py ADDED
@@ -0,0 +1,35 @@
+ import gradio as gr
+
+
+ class UISettings:
+     """
+     Utility class for managing UI settings.
+
+     This class provides static methods for toggling UI components, such as a sidebar.
+     """
+     @staticmethod
+     def toggle_sidebar(state):
+         """
+         Toggle the visibility state of a UI component.
+
+         Parameters:
+             state: The current state of the UI component.
+
+         Returns:
+             Tuple: A tuple containing the updated UI component state and the new state.
+         """
+         state = not state
+         return gr.update(visible=state), state
+
+     @staticmethod
+     def feedback(data: gr.LikeData):
+         """
+         Process user feedback on the generated response.
+
+         Parameters:
+             data (gr.LikeData): Gradio LikeData object containing user feedback.
+         """
+         if data.liked:
+             print("You upvoted this response: " + data.value)
+         else:
+             print("You downvoted this response: " + data.value)
src/utils/upload_file.py ADDED
@@ -0,0 +1,39 @@
+ from utils.prepare_vectordb import PrepareVectorDB
+ from typing import List, Tuple
+ from utils.load_config import LoadConfig
+
+ APPCFG = LoadConfig()
+
+
+ class UploadFile:
+     """
+     Utility class for handling file uploads and processing.
+
+     This class provides static methods for checking directories and processing uploaded files
+     to prepare a VectorDB.
+     """
+
+     @staticmethod
+     def process_uploaded_files(files_dir: List, chatbot: List, rag_with_dropdown: str) -> Tuple:
+         """
+         Process uploaded files to prepare a VectorDB.
+
+         Parameters:
+             files_dir (List): List of paths to the uploaded files.
+             chatbot (List): An instance of the chatbot for communication.
+             rag_with_dropdown (str): The selected action from the 'RAG with' dropdown.
+
+         Returns:
+             Tuple: A tuple containing an empty string and the updated chatbot instance.
+         """
+         if rag_with_dropdown == "Upload doc: Process for RAG":
+             prepare_vectordb_instance = PrepareVectorDB(
+                 data_directory=files_dir,
+                 persist_directory=APPCFG.custom_persist_directory,
+                 chunk_size=APPCFG.chunk_size,
+                 chunk_overlap=APPCFG.chunk_overlap)
+             prepare_vectordb_instance.prepare_and_save_vectordb()
+             chatbot.append(
+                 (" ", "Uploaded files are ready. Please ask your question."))
+         else:
+             chatbot.append(
+                 (" ", "If you would like to upload a PDF, please select your desired action in the 'RAG with' dropdown."))
+         return "", chatbot