ishaan-mital committed
Commit 4dd0f5b
1 Parent(s): f585ede

added embedding model and vector DB

Files changed (2)
  1. .gitignore +1 -0
  2. app.py +115 -4
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
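
app.py reads its Pinecone credentials from the environment, so the ignored .env file presumably holds something like the following (placeholder values, assumed rather than taken from the commit):

PINECONE_API_KEY=your-api-key
PINECONE_ENVIRONMENT=your-environment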
app.py CHANGED
@@ -1,7 +1,118 @@
+ import numpy as np
  import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ import os
+ import pinecone
+ import time
+ from torch import cuda
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+ import PyPDF2
+ import re
+ from langchain.vectorstores import Pinecone
+
+ embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
+ device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
+
+ embed_model = HuggingFaceEmbeddings(
+     model_name=embed_model_id,
+     model_kwargs={'device': device},
+     encode_kwargs={'device': device, 'batch_size': 32}
+ )
+
+ # get API key from app.pinecone.io and environment from console
+ pinecone.init(
+     api_key=os.environ.get('PINECONE_API_KEY'),
+     environment=os.environ.get('PINECONE_ENVIRONMENT')
+ )
+
+ docs = [
+     "this is one document",
+     "and another document"
+ ]
+
+ embeddings = embed_model.embed_documents(docs)
+
+ index_name = 'llama-rag'
+
+ if index_name not in pinecone.list_indexes():
+     pinecone.create_index(
+         index_name,
+         dimension=len(embeddings[0]),
+         metric='cosine'
+     )
+     # wait for index to finish initialization
+     while not pinecone.describe_index(index_name).status['ready']:
+         time.sleep(1)
+
+ index = pinecone.Index(index_name)
+ index.describe_index_stats()
+
+ # def extract_text_from_pdf(pdf_path):
+ #     pdf_file = open(pdf_path, 'rb')
+ #     pdf_reader = PyPDF2.PdfReader(pdf_file)
+ #     text = ""
+ #     for page_number in range(len(pdf_reader.pages)):
+ #         page = pdf_reader.pages[page_number]
+ #         text += page.extract_text()
+ #     pdf_file.close()
+ #     return text
+
+ # def identify_sections(text):
+ #     # Assuming sections start with "Chapter" headings
+ #     sections = re.split(r'\n1+', text)
+ #     sections = [section.strip() for section in sections if section.strip()]
+ #     return sections
+
+ # pdf_files = ['leph101.pdf', 'leph102.pdf', 'leph103.pdf', 'leph104.pdf', 'leph105.pdf', 'leph106.pdf', 'leph107.pdf', 'leph108.pdf']  # Add more file names as needed
+
+ # book_sections = []
+ # for pdf_file in pdf_files:
+ #     pdf_path = f'/content/{pdf_file}'
+ #     book_text = extract_text_from_pdf(pdf_path)
+ #     book_sections.append(identify_sections(book_text))
+ # print(len(book_sections))
+ # # Now you can organize and store the data as needed
+
+ # import pandas as pd
+
+ # data = pd.DataFrame({
+ #     'ID': range(len(book_sections)),  # Sequential IDs
+ #     'Text': book_sections
+ # })
+ # print(data)
+
+ # batch_size = 4
+
+ # for i in range(0, len(data), batch_size):
+ #     i_end = min(len(data), i + batch_size)
+ #     batch = data.iloc[i:i_end]
+ #     ids = [f"{x['ID']}" for i, x in batch.iterrows()]
+ #     texts = [x['Text'] for i, x in batch.iterrows()]
+ #     embeds = embed_model.embed_documents(texts)
+ #     # get metadata to store in Pinecone
+ #     metadata = [
+ #         {'text': x['Text'], 'ID': x['ID']} for i, x in batch.iterrows()
+ #     ]
+ #     # add to Pinecone
+ #     index.upsert(vectors=zip(ids, embeds, metadata))
+
+ text_field = 'text'  # field in metadata that contains text content
+
+ vectorstore = Pinecone(
+     index, embed_model.embed_query, text_field
+ )
+
+
+ def question(query):
+     # return the top 3 most relevant chunks, joined as plain text for the "text" output
+     results = vectorstore.similarity_search(query, k=3)
+     return "\n\n".join(doc.page_content for doc in results)
+
+
+ demo = gr.Interface(fn=question, inputs="text", outputs="text")
+
+ if __name__ == "__main__":
+     demo.launch()
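
One gap worth flagging: app.py embeds the two demo docs but never upserts them, and the PDF-ingestion path that would populate the index is still commented out, so similarity_search returns nothing on a fresh index. A minimal sketch of the missing step, assuming the index, docs, and embeddings objects defined in app.py; the ID scheme and metadata shape here are illustrative, not from the commit:

# minimal sketch (not part of the commit): write the demo embeddings to Pinecone
index.upsert(vectors=[
    (str(i), emb, {'text': doc})  # (id, vector values, metadata)
    for i, (doc, emb) in enumerate(zip(docs, embeddings))
])

# once the index holds vectors, the Gradio endpoint returns matches, e.g.
# question("one document")  ->  text of the most similar stored chunks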