Spaces:

mtyrrell
/

cpv_poc

Running

mtyrrell committed on Aug 7, 2023

Commit

f5548b0

•

1 Parent(s): d7e50c8

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -45,14 +45,15 @@ examples = [
 ]
 def get_docs(input_query, country = None):
-    # Construct a hacky query to focus the retriever on the target country (see notes below)
     if country:
         query = "For the country of "+country+", "+input_query
     else:
         query = input_query
-    # Get top 150 because we want to make sure we have 10 pertaining to the selected country
-    # TEMP SOLUTION: not ideal, but the FAISS document store doesn't allow metadata filtering. Needs to be tested with the full dataset
     docs = retriever.retrieve(query=query,top_k = 150)
     # Break out the key fields and convert to pandas for filtering
     docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs]
@@ -61,8 +62,9 @@ def get_docs(input_query, country = None):
       df_docs = df_docs.query('country in @country')
     # Take the top 10
     df_docs = df_docs.head(10)
     df_docs = df_docs.reset_index()
-    df_docs['ref_id'] = df_docs.index + 1
     # Convert back to Document format
     ls_dict = []
     # Iterate over df and add relevant fields to the dict object
@@ -72,7 +74,7 @@ def get_docs(input_query, country = None):
             row['content'],
             meta={'country': row['country'],'document_name': row['document'], 'ref_id': row['ref_id'], 'score': row['score']}
         )
         # Append the Document object to the documents list
         ls_dict.append(doc)
     return(ls_dict)

 ]
 def get_docs(input_query, country = None):
+    '''
+    Construct a hacky query to focus the retriever on the target country (see notes below)
+    We retrieve the top 150 documents (top_k = 150) because we want to make sure we have 10 pertaining to the selected country
+    '''
     if country:
         query = "For the country of "+country+", "+input_query
     else:
         query = input_query
+    # Retrieve top k documents
     docs = retriever.retrieve(query=query,top_k = 150)
     # Break out the key fields and convert to pandas for filtering
     docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs]
       df_docs = df_docs.query('country in @country')
     # Take the top 10
     df_docs = df_docs.head(10)
+    # Set up an index on which to base the source reference number (used in the prompt and for matching afterwards)
     df_docs = df_docs.reset_index()
+    df_docs['ref_id'] = df_docs.index + 1 # start the index at 1
     # Convert back to Document format
     ls_dict = []
     # Iterate over df and add relevant fields to the dict object
             row['content'],
             meta={'country': row['country'],'document_name': row['document'], 'ref_id': row['ref_id'], 'score': row['score']}
         )
         # Append the Document object to the documents list
         ls_dict.append(doc)
     return(ls_dict)