LOUIS SANNA committed on
Commit
a7c1ef2
1 Parent(s): a5610c0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ chroma filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Datak
3
- emoji: 🔥
4
  colorFrom: indigo
5
  colorTo: pink
6
  sdk: gradio
 
1
  ---
2
  title: Datak
3
+ emoji: 🪲
4
  colorFrom: indigo
5
  colorTo: pink
6
  sdk: gradio
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Gradio Q&A app: conversational retrieval over a persisted Chroma vector
# store, answered by OpenAI's chat model via LangChain.
from langchain.embeddings import OpenAIEmbeddings  # for creating embeddings
from langchain.vectorstores import Chroma  # for the vectorization part
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI  # the LLM model we'll use (CHatGPT)
import gradio as gr
from gradio import inputs, outputs
from gradio.mix import Parallel

# Maximum number of source documents displayed alongside each answer; must
# match the number of source Textbox components built below.
max_sources = 4

embedding = OpenAIEmbeddings()
# NOTE(review): "/chroma" is an absolute path, but the repository ships a
# relative "chroma/" directory — confirm the deployment actually mounts the
# store at the filesystem root, otherwise this should be "chroma".
vectordb = Chroma(persist_directory="/chroma", embedding_function=embedding)
# return_source_documents=True makes the chain include the retrieved
# documents in its result dict, which chat_pdf surfaces in the UI.
pdf_qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0.9, model_name="gpt-3.5-turbo"),
                                               vectordb.as_retriever(), return_source_documents=True)
def chat_pdf(query, chat_history=""):
    """Answer *query* against the vector store.

    Parameters
    ----------
    query : str
        The user's question.
    chat_history : str or list, optional
        Prior conversation passed through to the retrieval chain.

    Returns
    -------
    list[str]
        Exactly ``1 + max_sources`` strings — the answer first, then the
        formatted source documents, padded with empty strings — so the
        length always matches the Gradio output components.
    """
    result = pdf_qa({"question": query, "chat_history": chat_history})
    answer = result["answer"]
    source_docs = result["source_documents"]

    # Render each source as its metadata header followed by its content.
    # Truncate to max_sources: the interface has a fixed number of output
    # slots, and without the slice the padding arithmetic below would go
    # negative and the returned list would overflow the UI components.
    cleaned_docs = [
        f"Metadata: {doc.metadata}\n" + doc.page_content
        for doc in source_docs[:max_sources]
    ]

    # Pad so the list length always equals the interface's output count.
    # (Removed: a stray debug print and an unreachable second return that
    # followed the first return statement.)
    return [answer] + cleaned_docs + [""] * (max_sources - len(cleaned_docs))
def create_outputs(num_sources):
    """Build the Gradio output components for the interface.

    Parameters
    ----------
    num_sources : int
        How many source-document text boxes to create after the answer box.

    Returns
    -------
    list
        One "Answer" Textbox followed by ``num_sources`` numbered
        "Source Document i" Textboxes.
    """
    # Named `components` (not `outputs`) so the local does not shadow the
    # module-level `from gradio import inputs, outputs` import.
    components = [gr.outputs.Textbox(label="Answer")]

    for i in range(1, num_sources + 1):
        components.append(gr.outputs.Textbox(label=f"Source Document {i}"))

    return components
# Wire the chat function into a simple Gradio UI: one query text box in,
# one answer box plus `max_sources` source-document boxes out.
iface = gr.Interface(
    fn=chat_pdf,
    inputs=[gr.inputs.Textbox(label="Query")],
    outputs=create_outputs(max_sources),
    layout="vertical",
    examples=[
        ["Give 2 species of fulgoroidea"],
        ["What colors are found among fulgoroidea?"],
        ["Why are fulgoroidea so cute?"]
        # Add more example queries if desired
    ],
    css=".answer, .source_documents {width: 45%; float: left; margin-right: 20px;}"
)

# debug=True surfaces tracebacks in the app output while developing.
iface.launch(debug=True)
chroma/chroma-collections.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa9c6a5a3cfc099f13e2d6431a6b77129ba4279a75bd0b677ac883daaf98f17b
3
+ size 557
chroma/chroma-embeddings.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc05c6b9445f1cdb2f8c9258adc6b1c91c2438500c14054c24c343767f4d1761
3
+ size 44184
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ openai
2
+ langchain
3
+ chromadb
4
+ gradio