EmileH committed on
Commit
4ddb7d1
1 Parent(s): 9d51db9

Adding base app and knowledge text file

Files changed (2)
  1. app.py +183 -0
  2. knowledge-plain.txt +0 -0
app.py ADDED
@@ -0,0 +1,183 @@
+ from haystack.components.generators import OpenAIGenerator
+ from haystack.utils import Secret
+ from haystack.components.builders.prompt_builder import PromptBuilder
+ from haystack.components.routers import ConditionalRouter
+ # Chroma imports are kept for the planned persistent store; the app currently
+ # runs fully in memory (see the "for now" notes below)
+ from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+ from haystack import Pipeline
+ from haystack.components.writers import DocumentWriter
+ from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
+ from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
+ from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
+ from haystack.components.converters.txt import TextFileToDocument
+
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
+ from haystack.components.retrievers import InMemoryEmbeddingRetriever
+
+ import gradio as gr
+
+ # Embedding model used for both document and query embeddings
+ embedding_model = "dunzhang/stella_en_400M_v5"
+
+
+ ########################
+ ####### Indexing #######
+ ########################
+
+ # In-memory version for now
+ document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
+
+ converter = TextFileToDocument()
+
+ cleaner = DocumentCleaner()
+
+ # 200-word chunks with a 100-word (50%) overlap between consecutive chunks
+ splitter = DocumentSplitter(split_by="word", split_length=200, split_overlap=100)
+
+ # trust_remote_code is required because the stella model ships custom code
+ embedder = SentenceTransformersDocumentEmbedder(model=embedding_model,
+                                                 trust_remote_code=True)
+
+ writer = DocumentWriter(document_store=document_store)
+
+ indexing = Pipeline()
+
+ indexing.add_component("converter", converter)
+ indexing.add_component("cleaner", cleaner)
+ indexing.add_component("splitter", splitter)
+ indexing.add_component("embedder", embedder)
+ indexing.add_component("writer", writer)
+
+ indexing.connect("converter", "cleaner")
+ indexing.connect("cleaner", "splitter")
+ indexing.connect("splitter", "embedder")
+ indexing.connect("embedder", "writer")
+
+ indexing.run({"sources": ["knowledge-plain.txt"]})
+
+
+ ##################################
+ ####### Answering pipeline #######
+ ##################################
+
+ no_answer_message = (
+     "I'm not allowed to answer this question. Please ask something related to "
+     "API access in accordance with the DSA's transparency and data-sharing "
+     "provisions. Is there anything else I can do for you?"
+ )
+
+ relevance_prompt_template = """
+ Classify whether the user is asking about something related to social media APIs,
+ the Digital Services Act (DSA), or any topic related to online platforms' compliance
+ with legal and data-sharing frameworks.
+
+ Relevant topics include social media API access, data transparency, compliance
+ with DSA provisions, and online platform regulations.
+
+ Here is their message:
+
+ {{query}}
+
+ Here are the user's two previous messages. ONLY refer to these if the message above refers to previous ones.
+
+ {% for message in user_history[-2:] %}
+ * {{message["content"]}}
+
+ {% endfor %}
+
+ If the request is related to these topics, respond "YES". If it is off-topic (e.g., unrelated to APIs, the DSA, or legal frameworks), respond "NO"."""
+
+ routes = [
+     {
+         "condition": "{{'YES' in replies[0]}}",
+         "output": "{{query}}",
+         "output_name": "query",
+         "output_type": str,
+     },
+     {
+         "condition": "{{'NO' in replies[0]}}",
+         "output": no_answer_message,
+         "output_name": "no_answer",
+         "output_type": str,
+     },
+ ]
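+
+ # Illustration only, not executed by the app: ConditionalRouter returns the
+ # output of the first route whose condition renders truthy, so
+ #   ConditionalRouter(routes=routes).run(replies=["YES"], query="hi")
+ #   # -> {"query": "hi"}
+ #   ConditionalRouter(routes=routes).run(replies=["NO"], query="hi")
+ #   # -> {"no_answer": no_answer_message}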
+
+ query_prompt_template = """Conversation history:
+ {{conv_history}}
+
+ Here is what the user has requested:
+
+ {{query}}
+
+ Reply to the question with a short paragraph, based on the following documents:
+
+ {% for document in documents %}
+ * {{document.content}}
+
+ {% endfor %}
+
+ Do not mention the documents in your answer; present the information as your own knowledge.
+ """
+
+ prompt_builder = PromptBuilder(template=relevance_prompt_template)
+
+ # Classifier LLM: decides whether the query is on-topic
+ llm = OpenAIGenerator(
+     api_key=Secret.from_env_var("OPENAI_API_KEY"),
+     model="gpt-4o-mini",
+     generation_kwargs={"max_tokens": 8192},
+ )
+
+ router = ConditionalRouter(routes=routes)
+
+ # The query embedder must match the document embedder (same model, and stella
+ # needs trust_remote_code here too)
+ embedder = SentenceTransformersTextEmbedder(model=embedding_model,
+                                             trust_remote_code=True)
+
+ # Again: in memory for now
+ retriever = InMemoryEmbeddingRetriever(document_store)
+
+ prompt_builder2 = PromptBuilder(template=query_prompt_template)
+
+ # Answering LLM: generates the final reply from the retrieved documents
+ llm2 = OpenAIGenerator(
+     api_key=Secret.from_env_var("OPENAI_API_KEY"),
+     model="gpt-4o-mini",
+     generation_kwargs={"max_tokens": 8192},
+ )
+
+ answer_query = Pipeline()
+
+ answer_query.add_component("prompt_builder", prompt_builder)
+ answer_query.add_component("llm", llm)
+ answer_query.add_component("router", router)
+ answer_query.add_component("embedder", embedder)
+ answer_query.add_component("retriever", retriever)
+ answer_query.add_component("prompt_builder2", prompt_builder2)
+ answer_query.add_component("llm2", llm2)
+
+ answer_query.connect("prompt_builder", "llm")
+ answer_query.connect("llm", "router")
+ answer_query.connect("router.query", "embedder")
+ answer_query.connect("embedder", "retriever")
+ answer_query.connect("retriever", "prompt_builder2")
+ answer_query.connect("prompt_builder2", "llm2")
+
+ answer_query.warm_up()
+
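+ # Data flow: prompt_builder -> llm (relevance check) -> router. On "YES" the
+ # query continues through embedder -> retriever -> prompt_builder2 -> llm2;
+ # on "NO" the router emits no_answer and the retrieval branch never runs.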
+
+ ##########################
+ ####### Gradio app #######
+ ##########################
+
+ def chat(message, history):
+     """
+     Chat function for Gradio. Uses the pipeline to produce the next answer.
+     """
+     conv_history = "\n\n".join(f"{m['role']}: {m['content']}" for m in history[-2:])
+     user_history = [m for m in history if m["role"] == "user"]
+     # Haystack routes these top-level inputs to every component that declares
+     # them: query goes to prompt_builder, router, and prompt_builder2;
+     # user_history to prompt_builder; conv_history to prompt_builder2.
+     results = answer_query.run({"user_history": user_history, "query": message,
+                                 "conv_history": conv_history})
+     if "llm2" in results:
+         answer = results["llm2"]["replies"][0]
+     elif "router" in results and "no_answer" in results["router"]:
+         answer = results["router"]["no_answer"]
+     else:
+         answer = "Sorry, a mistake occurred."
+     return answer
+
+ if __name__ == "__main__":
+     gr.ChatInterface(chat, type="messages").launch()
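
Note: app.py imports ChromaDocumentStore and ChromaEmbeddingRetriever but, as the
"for now" comments indicate, both pipelines currently use the in-memory store. A
minimal sketch of the persistent variant, assuming a local persist_path
(hypothetical; the migration is not part of this commit):

    # Hypothetical swap-in for the in-memory components, reusing the imports
    # already present in app.py; "chroma-index" is an assumed local directory.
    document_store = ChromaDocumentStore(persist_path="chroma-index")
    retriever = ChromaEmbeddingRetriever(document_store=document_store)

Everything else could stay as is: DocumentWriter and both embedders are
store-agnostic, so only the store and the retriever component would change.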
knowledge-plain.txt ADDED
The diff for this file is too large to render. See raw diff