Spaces:
Runtime error
Runtime error
Jorge Henao
committed on
Commit
•
9735086
1
Parent(s):
3af52d7
Pipeline refactor with custom Pinecone retriever
Browse files- __pycache__/pinecode_quieries.cpython-38.pyc +0 -0
- app_pinecode.py +8 -0
- pinecode_quieries.py +83 -16
- requirements.txt +0 -1
__pycache__/pinecode_quieries.cpython-38.pyc
CHANGED
Binary files a/__pycache__/pinecode_quieries.cpython-38.pyc and b/__pycache__/pinecode_quieries.cpython-38.pyc differ
|
|
app_pinecode.py
CHANGED
@@ -4,6 +4,14 @@ import time
|
|
4 |
from config import Config
|
5 |
from pinecode_quieries import PinecodeProposalQueries
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
extractive_query = PinecodeProposalQueries (es_host = Config.es_host, es_index = Config.proposals_index,
|
8 |
es_user = Config.es_user, es_password = Config.es_password,
|
9 |
reader_name_or_path = Config.reader_model_name_or_path,
|
|
|
4 |
from config import Config
|
5 |
from pinecode_quieries import PinecodeProposalQueries
|
6 |
|
7 |
+
import logging
|
8 |
+
|
9 |
+
|
10 |
+
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
|
11 |
+
logging.getLogger("haystack").setLevel(logging.INFO)
|
12 |
+
|
13 |
+
logging.info("This is a test log ..")
|
14 |
+
|
15 |
extractive_query = PinecodeProposalQueries (es_host = Config.es_host, es_index = Config.proposals_index,
|
16 |
es_user = Config.es_user, es_password = Config.es_password,
|
17 |
reader_name_or_path = Config.reader_model_name_or_path,
|
pinecode_quieries.py
CHANGED
@@ -1,10 +1,34 @@
|
|
|
|
1 |
from abc import ABC, abstractmethod
|
2 |
from haystack.nodes import BM25Retriever, FARMReader
|
3 |
from haystack.document_stores import ElasticsearchDocumentStore
|
4 |
-
from haystack.pipelines import ExtractiveQAPipeline
|
5 |
from haystack.document_stores import PineconeDocumentStore
|
6 |
from haystack.nodes import EmbeddingRetriever
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
import certifi
|
9 |
import datetime
|
10 |
import requests
|
@@ -12,6 +36,36 @@ from base64 import b64encode
|
|
12 |
|
13 |
ca_certs=certifi.where()
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
class DocumentQueries(ABC):
|
16 |
|
17 |
@abstractmethod
|
@@ -28,34 +82,47 @@ class PinecodeProposalQueries(DocumentQueries):
|
|
28 |
def _initialize_pipeline(self, es_host, es_index, es_user, es_password, reader = None):
|
29 |
if reader is not None:
|
30 |
self.reader = reader
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
34 |
self.document_store = PineconeDocumentStore(
|
35 |
api_key=es_password,
|
36 |
environment = "us-east1-gcp",
|
37 |
-
index=
|
38 |
similarity="cosine",
|
39 |
embedding_dim=384
|
40 |
-
)
|
41 |
-
#self.retriever = BM25Retriever(document_store = self.document_store)
|
42 |
-
self.retriever = EmbeddingRetriever(
|
43 |
-
document_store=self.document_store,
|
44 |
-
#embedding_model="multi-qa-distilbert-dot-v1",
|
45 |
-
embedding_model = "sentence-transformers/msmarco-MiniLM-L6-cos-v5",
|
46 |
-
model_format="sentence_transformers"
|
47 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
#self.document_store.update_embeddings(self.retriever, update_existing_embeddings=False)
|
49 |
-
self.pipe = ExtractiveQAPipeline(self.reader, self.retriever)
|
|
|
50 |
|
51 |
def search_by_query(self, query: str, retriever_top_k: int, reader_top_k: int, es_index: str = None):
    """Run the question-answering pipeline for ``query`` and return its answers.

    Args:
        query: Free-text question handed to the pipeline.
        retriever_top_k: ``top_k`` forwarded to the pipeline node named "Retriever".
        reader_top_k: ``top_k`` forwarded to the pipeline node named "Reader".
        es_index: Accepted for interface compatibility; not used by this method.

    Returns:
        The value stored under the ``"answers"`` key of the pipeline result.
    """
    # Previously only the Retriever received a top_k, so ``reader_top_k`` was
    # silently ignored. Forward both limits to their respective nodes.
    params = {
        "Retriever": {"top_k": retriever_top_k},
        "Reader": {"top_k": reader_top_k},
    }
    prediction = self.pipe.run(query=query, params=params)
    return prediction["answers"]
|
61 |
|
|
|
1 |
+
|
2 |
from abc import ABC, abstractmethod
|
3 |
from haystack.nodes import BM25Retriever, FARMReader
|
4 |
from haystack.document_stores import ElasticsearchDocumentStore
|
5 |
+
from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline
|
6 |
from haystack.document_stores import PineconeDocumentStore
|
7 |
from haystack.nodes import EmbeddingRetriever
|
8 |
|
9 |
+
import json
|
10 |
+
import logging
|
11 |
+
import os
|
12 |
+
import shutil
|
13 |
+
import sys
|
14 |
+
import uuid
|
15 |
+
from json import JSONDecodeError
|
16 |
+
from pathlib import Path
|
17 |
+
from typing import List, Optional
|
18 |
+
|
19 |
+
import pandas as pd
|
20 |
+
import pinecone
|
21 |
+
import streamlit as st
|
22 |
+
|
23 |
+
from haystack import BaseComponent, Document
|
24 |
+
from haystack.document_stores import PineconeDocumentStore
|
25 |
+
from haystack.nodes import (
|
26 |
+
EmbeddingRetriever,
|
27 |
+
FARMReader
|
28 |
+
)
|
29 |
+
from haystack.pipelines import ExtractiveQAPipeline, Pipeline
|
30 |
+
from sentence_transformers import SentenceTransformer
|
31 |
+
|
32 |
import certifi
|
33 |
import datetime
|
34 |
import requests
|
|
|
36 |
|
37 |
ca_certs=certifi.where()
|
38 |
|
39 |
+
class PineconeRetriever(BaseComponent):
    """Haystack pipeline node that retrieves documents directly from a Pinecone index.

    The query text is embedded with a SentenceTransformer model, the resulting
    vector is sent to the Pinecone index, and every returned match is converted
    into a Haystack ``Document`` built from the match metadata.
    """

    outgoing_edges = 1
    # Number of matches requested when the caller does not supply ``top_k``.
    default_top_k = 10
    # Metadata fields copied into ``Document.meta``; a field missing from a
    # match is stored as None instead of aborting the whole query.
    _meta_fields = ("title", "page", "source")

    def __init__(self, sentence_transformer_name: str, api_key: str, environment: str, index_name: str):
        """Load the embedding model and open the Pinecone index.

        Args:
            sentence_transformer_name: Model name or path passed to ``SentenceTransformer``.
            api_key: Pinecone API key passed to ``pinecone.init``.
            environment: Pinecone environment passed to ``pinecone.init``.
            index_name: Name of the Pinecone index to query.
        """
        # NOTE(review): BaseComponent.__init__ is not invoked here - confirm
        # whether skipping it is intentional.
        self.sts_model = SentenceTransformer(sentence_transformer_name)
        pinecone.init(api_key=api_key, environment=environment)
        self.index = pinecone.Index(index_name)

    @classmethod
    def _to_document(cls, match) -> Document:
        """Convert one Pinecone match into a Haystack ``Document``."""
        metadata = match["metadata"]
        return Document(
            # ``content`` stays mandatory: a match without text is useless downstream.
            content=metadata["content"],
            meta={field: metadata.get(field) for field in cls._meta_fields},
        )

    def run(self, query: str, top_k: Optional[int] = None):
        """Retrieve the documents closest to ``query``.

        Args:
            query: Text to embed and search for.
            top_k: Maximum number of matches to request; falls back to
                ``default_top_k`` when omitted or None.

        Returns:
            A ``(output, edge)`` tuple where ``output`` has the keys
            ``"documents"`` and ``"query"`` and ``edge`` is ``"output_1"``.
        """
        if top_k is None:
            top_k = self.default_top_k
        vector_embeddings = self.sts_model.encode(query).tolist()
        response = self.index.query([vector_embeddings], top_k=top_k, include_metadata=True)
        docs = [self._to_document(match) for match in response["matches"]]
        output = {"documents": docs, "query": query}
        return output, "output_1"

    def run_batch(self, queries: List[str], top_k: Optional[int] = None):
        """Retrieve documents for several queries by calling ``run`` once per query.

        Args:
            queries: Texts to embed and search for.
            top_k: Maximum number of matches per query; see ``run``.

        Returns:
            A ``(output, edge)`` tuple where ``output["documents"]`` holds one
            list of documents per query (in input order), ``output["queries"]``
            echoes the input and ``edge`` is ``"output_1"``.
        """
        documents = [self.run(query=query, top_k=top_k)[0]["documents"] for query in queries]
        return {"documents": documents, "queries": queries}, "output_1"
|
67 |
+
|
68 |
+
|
69 |
class DocumentQueries(ABC):
|
70 |
|
71 |
@abstractmethod
|
|
|
82 |
def _initialize_pipeline(self, es_host, es_index, es_user, es_password, reader = None):
    """Build the Pinecone retriever -> reader pipeline and store it on ``self.pipe``.

    ``es_password`` is used as the Pinecone API key. ``es_host``, ``es_index``
    and ``es_user`` are accepted but not used by this method. When ``reader``
    is given it replaces ``self.reader`` before the pipeline is assembled.
    """
    pinecone_environment = "us-east1-gcp"
    index_name = "semantic-text-search"

    if reader is not None:
        self.reader = reader

    self.document_store = PineconeDocumentStore(
        api_key=es_password,
        environment=pinecone_environment,
        index=index_name,
        similarity="cosine",
        embedding_dim=384,
    )

    # The pipeline attribute is assigned before the retriever is constructed,
    # matching the original statement order.
    self.pipe = Pipeline()
    retriever_node = PineconeRetriever(
        "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
        es_password,
        pinecone_environment,
        index_name,
    )
    for node, node_name, node_inputs in (
        (retriever_node, "Retriever", ["Query"]),
        (self.reader, "Reader", ["Retriever"]),
    ):
        self.pipe.add_node(component=node, name=node_name, inputs=node_inputs)
|
118 |
|
119 |
def search_by_query(self, query: str, retriever_top_k: int, reader_top_k: int, es_index: str = None):
    """Run ``self.pipe`` for ``query`` and return the ``"answers"`` entry of its result.

    ``retriever_top_k`` and ``reader_top_k`` are forwarded as the ``top_k``
    parameter of the "Retriever" and "Reader" nodes respectively.
    ``es_index`` is accepted but not used by this method.
    """
    node_limits = (("Retriever", retriever_top_k), ("Reader", reader_top_k))
    node_params = {node_name: {"top_k": limit} for node_name, limit in node_limits}
    return self.pipe.run(query=query, params=node_params)["answers"]
|
128 |
|
requirements.txt
CHANGED
@@ -2,4 +2,3 @@ transformers==4.19.2
|
|
2 |
torch==1.10.2
|
3 |
#farm-haystack==1.5.0
|
4 |
farm-haystack[pinecone]==1.5.0
|
5 |
-
pinecone-client<=2.0.10
|
|
|
2 |
torch==1.10.2
|
3 |
#farm-haystack==1.5.0
|
4 |
farm-haystack[pinecone]==1.5.0
|
|