Jorge Henao committed on
Commit
9735086
1 Parent(s): 3af52d7

Pipeline refactor with custom Pinecone retriever

Browse files
__pycache__/pinecode_quieries.cpython-38.pyc CHANGED
Binary files a/__pycache__/pinecode_quieries.cpython-38.pyc and b/__pycache__/pinecode_quieries.cpython-38.pyc differ
 
app_pinecode.py CHANGED
@@ -4,6 +4,14 @@ import time
4
  from config import Config
5
  from pinecode_quieries import PinecodeProposalQueries
6
 
 
 
 
 
 
 
 
 
7
  extractive_query = PinecodeProposalQueries (es_host = Config.es_host, es_index = Config.proposals_index,
8
  es_user = Config.es_user, es_password = Config.es_password,
9
  reader_name_or_path = Config.reader_model_name_or_path,
 
4
  from config import Config
5
  from pinecode_quieries import PinecodeProposalQueries
6
 
7
+ import logging
8
+
9
+
10
+ logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
11
+ logging.getLogger("haystack").setLevel(logging.INFO)
12
+
13
+ logging.info("This is a test log ..")
14
+
15
  extractive_query = PinecodeProposalQueries (es_host = Config.es_host, es_index = Config.proposals_index,
16
  es_user = Config.es_user, es_password = Config.es_password,
17
  reader_name_or_path = Config.reader_model_name_or_path,
pinecode_quieries.py CHANGED
@@ -1,10 +1,34 @@
 
1
  from abc import ABC, abstractmethod
2
  from haystack.nodes import BM25Retriever, FARMReader
3
  from haystack.document_stores import ElasticsearchDocumentStore
4
- from haystack.pipelines import ExtractiveQAPipeline
5
  from haystack.document_stores import PineconeDocumentStore
6
  from haystack.nodes import EmbeddingRetriever
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import certifi
9
  import datetime
10
  import requests
@@ -12,6 +36,36 @@ from base64 import b64encode
12
 
13
  ca_certs=certifi.where()
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  class DocumentQueries(ABC):
16
 
17
  @abstractmethod
@@ -28,34 +82,47 @@ class PinecodeProposalQueries(DocumentQueries):
28
  def _initialize_pipeline(self, es_host, es_index, es_user, es_password, reader = None):
29
  if reader is not None:
30
  self.reader = reader
31
- self.es_host = es_host
32
- self.es_user = es_user
33
- self.es_password = es_password
 
34
  self.document_store = PineconeDocumentStore(
35
  api_key=es_password,
36
  environment = "us-east1-gcp",
37
- index="semantic-text-search",
38
  similarity="cosine",
39
  embedding_dim=384
40
- )
41
- #self.retriever = BM25Retriever(document_store = self.document_store)
42
- self.retriever = EmbeddingRetriever(
43
- document_store=self.document_store,
44
- #embedding_model="multi-qa-distilbert-dot-v1",
45
- embedding_model = "sentence-transformers/msmarco-MiniLM-L6-cos-v5",
46
- model_format="sentence_transformers"
47
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  #self.document_store.update_embeddings(self.retriever, update_existing_embeddings=False)
49
- self.pipe = ExtractiveQAPipeline(self.reader, self.retriever)
 
50
 
51
  def search_by_query(self, query : str, retriever_top_k: int, reader_top_k: int, es_index: str = None) :
52
  #self.document_store.update_embeddings(self.retriever, update_existing_embeddings=False)
53
 
54
- #self.log.write_log(query, "hfspace-informecomision")
55
  #if es_index is not None:
56
  #self._initialize_pipeline(self.es_host, es_index, self.es_user, self.es_password)
57
- #params = {"Retriever": {"top_k": retriever_top_k}, "Reader": {"top_k": reader_top_k}}
58
- params = {"Retriever": {"top_k": retriever_top_k}}
59
  prediction = self.pipe.run( query = query, params = params)
60
  return prediction["answers"]
61
 
 
1
+
2
  from abc import ABC, abstractmethod
3
  from haystack.nodes import BM25Retriever, FARMReader
4
  from haystack.document_stores import ElasticsearchDocumentStore
5
+ from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline
6
  from haystack.document_stores import PineconeDocumentStore
7
  from haystack.nodes import EmbeddingRetriever
8
 
9
+ import json
10
+ import logging
11
+ import os
12
+ import shutil
13
+ import sys
14
+ import uuid
15
+ from json import JSONDecodeError
16
+ from pathlib import Path
17
+ from typing import List, Optional
18
+
19
+ import pandas as pd
20
+ import pinecone
21
+ import streamlit as st
22
+
23
+ from haystack import BaseComponent, Document
24
+ from haystack.document_stores import PineconeDocumentStore
25
+ from haystack.nodes import (
26
+ EmbeddingRetriever,
27
+ FARMReader
28
+ )
29
+ from haystack.pipelines import ExtractiveQAPipeline, Pipeline
30
+ from sentence_transformers import SentenceTransformer
31
+
32
  import certifi
33
  import datetime
34
  import requests
 
36
 
37
  ca_certs=certifi.where()
38
 
39
class PineconeRetriever(BaseComponent):
    """Haystack pipeline node that retrieves documents directly from a Pinecone index.

    The incoming query is embedded with a SentenceTransformer model, the
    embedding is sent to the Pinecone index, and every returned match is
    converted into a Haystack ``Document`` built from the match metadata.
    """

    outgoing_edges = 1

    # Number of matches requested when the caller does not supply ``top_k``.
    default_top_k = 10

    def __init__(self, sentence_transformer_name: str, api_key: str, environment: str, index_name: str):
        """Load the embedding model and open the Pinecone index.

        Args:
            sentence_transformer_name: Model name or path given to ``SentenceTransformer``.
            api_key: Pinecone API key passed to ``pinecone.init``.
            environment: Pinecone environment passed to ``pinecone.init``.
            index_name: Name of the Pinecone index to query.
        """
        # NOTE(review): BaseComponent.__init__ is not invoked here — confirm
        # that skipping the base initializer is intended.
        self.sts_model = SentenceTransformer(sentence_transformer_name)
        pinecone.init(api_key=api_key, environment=environment)
        self.index = pinecone.Index(index_name)

    def run(self, query: str, top_k: Optional[int] = None):
        """Embed ``query``, search the Pinecone index and return the matches as documents.

        Args:
            query: Free-text query to embed and search for.
            top_k: Maximum number of matches to request. When omitted or
                ``None``, ``default_top_k`` is used so that ``None`` is never
                forwarded to Pinecone.

        Returns:
            A ``(output, edge_name)`` tuple where ``output`` holds the retrieved
            ``documents`` and the original ``query``, and ``edge_name`` is
            ``"output_1"``.

        Raises:
            KeyError: If a match's metadata lacks ``content``, ``title``,
                ``page`` or ``source``.
        """
        if top_k is None:
            top_k = self.default_top_k

        vector_embeddings = self.sts_model.encode(query).tolist()
        # The embedding is wrapped in a list, presumably because the client
        # expects a batch of query vectors — TODO confirm against the
        # installed pinecone-client version.
        response = self.index.query([vector_embeddings], top_k=top_k, include_metadata=True)

        docs = []
        for match in response["matches"]:
            metadata = match["metadata"]
            docs.append(
                Document(
                    content=metadata['content'],
                    meta={
                        'title': metadata['title'],
                        'page': metadata['page'],
                        'source': metadata['source'],
                    },
                )
            )

        output = {"documents": docs, "query": query}
        return output, "output_1"

    def run_batch(self, queries: List[str], top_k: Optional[int] = None):
        """Batch entry point; currently a stub that performs no retrieval.

        NOTE(review): always returns an empty output regardless of ``queries``.
        """
        return {}, "output_1"
67
+
68
+
69
  class DocumentQueries(ABC):
70
 
71
  @abstractmethod
 
82
  def _initialize_pipeline(self, es_host, es_index, es_user, es_password, reader = None):
83
  if reader is not None:
84
  self.reader = reader
85
+
86
+ #pinecone.init(api_key=es_password, environment="us-east1-gcp")
87
+ index_name = "semantic-text-search"
88
+
89
  self.document_store = PineconeDocumentStore(
90
  api_key=es_password,
91
  environment = "us-east1-gcp",
92
+ index=index_name,
93
  similarity="cosine",
94
  embedding_dim=384
 
 
 
 
 
 
 
95
  )
96
+ self.pipe = Pipeline()
97
+ pinecone_retriever = PineconeRetriever("sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
98
+ es_password, "us-east1-gcp",
99
+ index_name)
100
+ self.pipe.add_node(component=pinecone_retriever, name="Retriever", inputs=["Query"])
101
+ self.pipe.add_node(component=self.reader, name="Reader", inputs=["Retriever"])
102
+
103
+ # #self.retriever = BM25Retriever(document_store = self.document_store)
104
+ # self.retriever = EmbeddingRetriever(
105
+ # document_store=self.document_store,
106
+ # #embedding_model="multi-qa-distilbert-dot-v1",
107
+ # embedding_model = "sentence-transformers/msmarco-MiniLM-L6-cos-v5",
108
+ # model_format="sentence_transformers"
109
+ # )
110
+
111
+ # retriever_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
112
+
113
+
114
+
115
  #self.document_store.update_embeddings(self.retriever, update_existing_embeddings=False)
116
+ #self.pipe = ExtractiveQAPipeline (reader = self.reader, retriever = self.retriever)
117
+ #self.pipe = DocumentSearchPipeline(self.retriever)
118
 
119
  def search_by_query(self, query : str, retriever_top_k: int, reader_top_k: int, es_index: str = None) :
120
  #self.document_store.update_embeddings(self.retriever, update_existing_embeddings=False)
121
 
 
122
  #if es_index is not None:
123
  #self._initialize_pipeline(self.es_host, es_index, self.es_user, self.es_password)
124
+ params = {"Retriever": {"top_k": retriever_top_k}, "Reader": {"top_k": reader_top_k}}
125
+ #params = {"Retriever": {"top_k": retriever_top_k}}
126
  prediction = self.pipe.run( query = query, params = params)
127
  return prediction["answers"]
128
 
requirements.txt CHANGED
@@ -2,4 +2,3 @@ transformers==4.19.2
2
  torch==1.10.2
3
  #farm-haystack==1.5.0
4
  farm-haystack[pinecone]==1.5.0
5
- pinecone-client<=2.0.10
 
2
  torch==1.10.2
3
  #farm-haystack==1.5.0
4
  farm-haystack[pinecone]==1.5.0