MOHAMMED-N committed on
Commit 0de70e8 · verified · 1 parent: 99e777b

Upload 6 files

Files changed (6)
  1. app.py +111 -0
  2. chain_setup.py +50 -0
  3. embedding.py +13 -0
  4. runtime.txt +1 -0
  5. streamlit_app.py +58 -0
  6. vectorstore.py +32 -0
app.py ADDED
@@ -0,0 +1,111 @@
+ import streamlit as st
+ import os
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_experimental.text_splitter import SemanticChunker
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.memory import ConversationBufferMemory
+
+ # --- 1) Page setup ---
+ st.title("💬 المحادثة التفاعلية - إدارة البيانات وحماية البيانات الشخصية")
+ local_file = "Policies001.pdf"
+
+ index_folder = "faiss_index"
+
+ # Custom CSS so that Arabic text renders right-to-left
+ st.markdown(
+     """
+     <style>
+     .rtl {
+         direction: rtl;
+         text-align: right;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ # --- 2) Load or build the FAISS vector store ---
+ embeddings = HuggingFaceEmbeddings(
+     model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
+     model_kwargs={"trust_remote_code": True}
+ )
+
+ if os.path.exists(index_folder):
+     # Load the existing index from disk
+     vectorstore = FAISS.load_local(index_folder, embeddings, allow_dangerous_deserialization=True)
+ else:
+     # Load the PDF and split it into semantic chunks
+     loader = PyPDFLoader(local_file)
+     documents = loader.load()
+
+     text_splitter = SemanticChunker(
+         embeddings=embeddings,
+         breakpoint_threshold_type='percentile',
+         breakpoint_threshold_amount=90
+     )
+     chunked_docs = text_splitter.split_documents(documents)
+
+     # Build the FAISS index and persist it
+     vectorstore = FAISS.from_documents(chunked_docs, embeddings)
+     vectorstore.save_local(index_folder)
+
+ # --- 3) Retriever setup ---
+ retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+
+ # --- 4) Text-generation model setup ---
+ model_name = "CohereForAI/c4ai-command-r7b-arabic-02-2025"  # model name
+
+ # Make sure a Hugging Face token is available
+ hf_token = os.getenv("HF_TOKEN")
+ if hf_token is None:
+     st.error("Hugging Face token not found. Please set the 'HF_TOKEN' environment variable.")
+     st.stop()
+
+ # Load the tokenizer and model (`token` replaces the deprecated `use_auth_token`)
+ tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+ model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)
+
+ # Text-generation pipeline (device=0 assumes a GPU is available)
+ qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
+
+ # --- 5) Conversation memory (defined here but not yet wired into the generation step) ---
+ memory = ConversationBufferMemory(
+     memory_key="chat_history",
+     return_messages=True
+ )
+
+ # --- 6) Chat message state ---
+ if "messages" not in st.session_state:
+     st.session_state["messages"] = [
+         {"role": "assistant", "content": "👋 مرحبًا! اسألني أي شيء عن إدارة البيانات وحماية البيانات الشخصية!"}
+     ]
+
+ # Render the existing messages
+ for msg in st.session_state["messages"]:
+     with st.chat_message(msg["role"]):
+         st.markdown(f'<div class="rtl">{msg["content"]}</div>', unsafe_allow_html=True)
+
+ # --- 7) Chat input ---
+ user_input = st.chat_input("اكتب سؤالك هنا")
+
+ # --- 8) Handle the user message ---
+ if user_input:
+     # Show the user message
+     st.session_state["messages"].append({"role": "user", "content": user_input})
+     with st.chat_message("user"):
+         st.markdown(f'<div class="rtl">{user_input}</div>', unsafe_allow_html=True)
+
+     # Retrieve the relevant documents and build the prompt
+     retrieved_docs = retriever.get_relevant_documents(user_input)
+     context = "\n".join([doc.page_content for doc in retrieved_docs])
+     full_input = f"السياق:\n{context}\n\nالسؤال:\n{user_input}"
+
+     # Generate the answer (max_new_tokens caps only the generated text, so a long prompt does not error out)
+     response = qa_pipeline(full_input, max_new_tokens=500, num_return_sequences=1)[0]["generated_text"]
+
+     # Show the answer
+     st.session_state["messages"].append({"role": "assistant", "content": response})
+     with st.chat_message("assistant"):
+         st.markdown(f'<div class="rtl">{response}</div>', unsafe_allow_html=True)
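
One behavior worth noting in app.py: a transformers text-generation pipeline returns the prompt together with the completion, so the chat will echo the retrieved context back to the user. A minimal sketch of suppressing that with return_full_text=False is below; it reuses the model name from app.py, assumes HF_TOKEN access is already configured, and is an illustration rather than part of the commit.

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "CohereForAI/c4ai-command-r7b-arabic-02-2025"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "السياق:\n...\n\nالسؤال:\nما هي البيانات الشخصية؟"  # placeholder prompt for illustration
# return_full_text=False keeps only the newly generated tokens,
# so the prompt/context is not echoed back into the chat.
result = qa_pipeline(prompt, max_new_tokens=500, return_full_text=False)
answer = result[0]["generated_text"].strip()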
chain_setup.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ from huggingface_hub import hf_hub_download
+ from langchain_community.llms import LlamaCpp
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.memory import ConversationBufferMemory
+
+ def load_llm():
+     """
+     Downloads the Q4_K_M GGUF build of c4ai-command-r7b-12-2024 from the bartowski repository on Hugging Face and loads it via llama-cpp.
+     """
+     # 1) Download the GGUF model from Hugging Face
+     model_file = hf_hub_download(
+         repo_id="bartowski/c4ai-command-r7b-12-2024-GGUF",
+         filename="c4ai-command-r7b-12-2024-Q4_K_M.gguf",
+         local_dir="./models",
+         local_dir_use_symlinks=False
+     )
+
+     # 2) Load the model with llama-cpp via LangChain's LlamaCpp
+     llm = LlamaCpp(
+         model_path=model_file,
+         flash_attn=False,
+         n_ctx=2048,    # or 4096 depending on your needs
+         n_batch=512,   # or even 256 depending on your hardware
+         chat_format='chatml'
+     )
+
+     return llm
+
+ def build_conversational_chain(vectorstore):
+     """
+     Creates a ConversationalRetrievalChain using the local llama-cpp-based LLM
+     and a ConversationBufferMemory for multi-turn Q&A.
+     """
+     llm = load_llm()
+
+     # Chat history is kept in memory so the chain can handle multi-turn conversations
+     memory = ConversationBufferMemory(
+         memory_key="chat_history",
+         return_messages=True
+     )
+
+     qa_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
+         memory=memory,
+         verbose=True
+     )
+
+     return qa_chain
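
For reference, this is how the chain is meant to be consumed; streamlit_app.py below does exactly this inside a Streamlit UI. A minimal command-line sketch, assuming the same Policies001.pdf and faiss_index paths used elsewhere in the commit:

from embedding import load_embeddings
from vectorstore import load_or_build_vectorstore
from chain_setup import build_conversational_chain

embeddings = load_embeddings()
vectorstore = load_or_build_vectorstore("Policies001.pdf", "faiss_index", embeddings)
qa_chain = build_conversational_chain(vectorstore)

# The chain keeps its own chat history via ConversationBufferMemory,
# so repeated calls form a multi-turn conversation.
result = qa_chain({"question": "ما الهدف من سياسة حماية البيانات الشخصية؟"})
print(result["answer"])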
embedding.py ADDED
@@ -0,0 +1,13 @@
+ # embedding.py
+
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ def load_embeddings():
+     """
+     Returns a HuggingFaceEmbeddings instance.
+     """
+     embeddings = HuggingFaceEmbeddings(
+         model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
+         model_kwargs={"trust_remote_code": True}
+     )
+     return embeddings
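
A quick way to sanity-check the embedding model (a sketch, not part of the commit): embed_query returns a single vector, while embed_documents returns one vector per input text.

from embedding import load_embeddings

embeddings = load_embeddings()
vector = embeddings.embed_query("حماية البيانات الشخصية")      # one list[float]
vectors = embeddings.embed_documents(["نص أول", "نص ثانٍ"])    # one vector per document
print(len(vector), len(vectors))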
runtime.txt ADDED
@@ -0,0 +1 @@
+ python-3.12
streamlit_app.py ADDED
@@ -0,0 +1,58 @@
+ # streamlit_app.py
+
+ import streamlit as st
+ import os
+
+ # Local imports
+ from embedding import load_embeddings
+ from vectorstore import load_or_build_vectorstore
+ from chain_setup import build_conversational_chain
+
+ def main():
+     st.title("💬 المحادثة التفاعلية - إدارة البيانات وحماية البيانات الشخصية")
+
+     # Paths and constants
+     local_file = "Policies001.pdf"
+     index_folder = "faiss_index"
+
+     # Step 1: Load embeddings
+     embeddings = load_embeddings()
+
+     # Step 2: Build or load the vector store
+     vectorstore = load_or_build_vectorstore(local_file, index_folder, embeddings)
+
+     # Step 3: Build the conversational retrieval chain
+     qa_chain = build_conversational_chain(vectorstore)
+
+     # Step 4: Session state for the chat UI
+     if "messages" not in st.session_state:
+         st.session_state["messages"] = [
+             {"role": "assistant", "content": "👋 مرحبًا! اسألني أي شيء عن إدارة البيانات وحماية البيانات الشخصية"}
+         ]
+
+     # Display existing messages
+     for msg in st.session_state["messages"]:
+         with st.chat_message(msg["role"]):
+             st.markdown(msg["content"])
+
+     # Step 5: Chat input
+     user_input = st.chat_input("Type your question...")
+
+     # Step 6: Process user input
+     if user_input:
+         # a) Display the user message
+         st.session_state["messages"].append({"role": "user", "content": user_input})
+         with st.chat_message("user"):
+             st.markdown(user_input)
+
+         # b) Run the chain
+         response_dict = qa_chain({"question": user_input})
+         answer = response_dict["answer"]
+
+         # c) Display the assistant response
+         st.session_state["messages"].append({"role": "assistant", "content": answer})
+         with st.chat_message("assistant"):
+             st.markdown(answer)
+
+ if __name__ == "__main__":
+     main()
vectorstore.py ADDED
@@ -0,0 +1,32 @@
+ # vectorstore.py
+
+ import os
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_experimental.text_splitter import SemanticChunker
+ from langchain_community.vectorstores import FAISS
+
+ def load_or_build_vectorstore(local_file: str, index_folder: str, embeddings):
+     """
+     Loads a local FAISS index if it exists; otherwise,
+     builds a new index from the specified PDF file.
+     """
+     if os.path.exists(index_folder):
+         print("Loading existing FAISS index from disk...")
+         vectorstore = FAISS.load_local(index_folder, embeddings, allow_dangerous_deserialization=True)
+     else:
+         print("Building a new FAISS index...")
+         loader = PyPDFLoader(local_file)
+         documents = loader.load()
+
+         text_splitter = SemanticChunker(
+             embeddings=embeddings,
+             breakpoint_threshold_type='percentile',
+             breakpoint_threshold_amount=90
+         )
+         chunked_docs = text_splitter.split_documents(documents)
+         print(f"Document split into {len(chunked_docs)} chunks.")
+
+         vectorstore = FAISS.from_documents(chunked_docs, embeddings)
+         vectorstore.save_local(index_folder)
+
+     return vectorstore
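
Note that load_or_build_vectorstore only checks whether the index folder exists, so the index is not rebuilt when the PDF changes; deleting the folder forces a rebuild. A small sketch of doing that and querying the store directly, assuming the same file paths as above:

import os
import shutil

from embedding import load_embeddings
from vectorstore import load_or_build_vectorstore

# Force a rebuild after Policies001.pdf changes (assumption: a stale index can simply be deleted)
if os.path.exists("faiss_index"):
    shutil.rmtree("faiss_index")

embeddings = load_embeddings()
vectorstore = load_or_build_vectorstore("Policies001.pdf", "faiss_index", embeddings)

# Direct similarity search, mirroring the k=5 retriever used in the app
for doc in vectorstore.similarity_search("حماية البيانات الشخصية", k=5):
    print(doc.page_content[:120])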