Weedoo's picture
add deletion of csv file and index created during a session
868658d
raw
history blame
4.48 kB
import pandas as pd
import arxiv
import requests
from pinecone import Pinecone, ServerlessSpec
import logging
import os
# Work from the directory that contains this file so that the relative paths
# used by the functions below (e.g. 'arxiv-scrape.csv') resolve next to the
# script no matter where the process was launched from.
script_dir = os.path.split(os.path.abspath(__file__))[0]
os.chdir(script_dir)
def get_zotero_ids(api_key, library_id, tag):
    """Return the arXiv identifiers of the Zotero items carrying ``tag``.

    Requests ``/users/<library_id>/items`` from the Zotero Web API, filtered
    by ``tag``, and reads the ``archiveID`` field of every returned item with
    the ``arXiv:`` prefix removed.

    Args:
        api_key: Zotero API key, sent as a bearer token.
        library_id: Zotero user library id (str or int).
        tag: Tag used to filter the library items.

    Returns:
        A list of identifier strings. Items without an ``archiveID`` are
        skipped instead of raising ``KeyError``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    url = f'https://api.zotero.org/users/{library_id}/items'
    headers = {'Authorization': f'Bearer {api_key}'}
    # Passing the tag through ``params`` lets requests URL-encode it, so tags
    # containing spaces, '&' or '#' no longer corrupt the query string.
    response = requests.get(url, headers=headers, params={'tag': tag}, timeout=30)
    # Fail loudly on 403/404/... instead of trying to parse an error body.
    response.raise_for_status()

    # NOTE(review): only the single page returned by this request is read;
    # confirm whether libraries larger than one API page must be supported.
    ids = []
    for item in response.json():
        archive_id = item.get('data', {}).get('archiveID')
        if archive_id:
            ids.append(archive_id.replace('arXiv:', ''))
    return ids
def get_arxiv_papers(ids=None, category=None, comment=None):
    """Fetch paper metadata from arXiv as a DataFrame.

    Two query modes are supported:

    * ``category`` is None: look up exactly the papers listed in ``ids``.
    * ``category`` is given: fetch the 15 most recently submitted papers in
      that category, optionally restricted to those whose comment field
      matches ``comment``.

    When ``ids`` is non-empty the result is also written to
    ``arxiv-scrape.csv`` in the current working directory.

    Args:
        ids: List of arXiv identifiers, or None.
        category: arXiv category string (used as ``cat:<category>``), or None.
        comment: Comment filter (used as ``co:<comment>``), or None.

    Returns:
        DataFrame with the columns ``Title``, ``Abstract``, ``Date``
        (``YYYY-MM-DD``) and ``id`` (the arXiv entry URL).

    Raises:
        ValueError: If neither ``ids`` nor ``category`` is provided.
    """
    # Validate before touching ``ids``: previously ``len(ids)`` ran first and
    # raised TypeError, so this ValueError could never be reached.
    if ids is None and category is None:
        raise ValueError('not a valid query')

    logging.getLogger('arxiv').setLevel(logging.WARNING)
    client = arxiv.Client()

    if category is None:
        search = arxiv.Search(
            id_list=ids,
            max_results=len(ids),
        )
    else:
        custom_query = f'cat:{category}'
        if comment is not None:
            custom_query += f' AND co:{comment}'
        search = arxiv.Search(
            query=custom_query,
            max_results=15,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )

    # Run the query once and reuse the results for every column; iterating
    # ``client.results(search)`` per column repeated the network fetch four
    # times and could yield misaligned columns if the feed changed between.
    papers = list(client.results(search))
    df = pd.DataFrame({
        'Title': [paper.title for paper in papers],
        'Abstract': [paper.summary.replace('\n', ' ') for paper in papers],
        'Date': [paper.published.date().strftime('%Y-%m-%d') for paper in papers],
        'id': [paper.entry_id for paper in papers],
    })

    if ids:
        df.to_csv('arxiv-scrape.csv', index=False)
    return df
def get_hf_embeddings(api_key, df):
    """Embed every paper in ``df`` with the ``malteos/scincl`` model.

    Each paper is sent to the Hugging Face Inference API as
    ``<Title>[SEP]<Abstract>``. If the first request answers 503, the request
    is repeated once with the ``wait_for_model`` option enabled.

    Args:
        api_key: Hugging Face API token, sent as a bearer token.
        df: DataFrame with ``Title`` and ``Abstract`` columns.

    Returns:
        Tuple ``(embeddings, dim)`` where ``embeddings`` is the decoded JSON
        list returned by the API and ``dim`` is the length of its first entry.

    Raises:
        ValueError: If ``df`` has no rows, or the API response is not a
            non-empty list (for example an ``{"error": ...}`` payload).
        requests.HTTPError: If the final response has an error status.
    """
    title_abs = [title + '[SEP]' + abstract for title, abstract in zip(df['Title'], df['Abstract'])]
    if not title_abs:
        # Nothing to embed; fail clearly before making a request.
        raise ValueError('No papers to embed: df is empty')

    API_URL = "https://api-inference.huggingface.co/models/malteos/scincl"
    headers = {"Authorization": f"Bearer {api_key}"}

    response = requests.post(API_URL, headers=headers, json={"inputs": title_abs})
    if response.status_code == 503:
        # ``wait_for_model`` must be nested under "options"; as a top-level
        # key the retry did not wait, and the error payload then surfaced as
        # ``KeyError: 0`` when it was indexed like a list of embeddings.
        logging.warning('Embedding model is loading (HTTP 503); retrying with wait_for_model.')
        response = requests.post(
            API_URL,
            headers=headers,
            json={"inputs": title_abs, "options": {"wait_for_model": True}},
        )
    response.raise_for_status()

    embeddings = response.json()
    if not isinstance(embeddings, list) or not embeddings:
        raise ValueError(f'Unexpected embedding response: {embeddings!r}')
    return embeddings, len(embeddings[0])
def upload_to_pinecone(api_key, index, namespace, embeddings, dim, df):
    """Create a Pinecone index and upsert one vector per paper into it.

    Args:
        api_key: Pinecone API key.
        index: Name of the index to create.
        namespace: Namespace the vectors are upserted into.
        embeddings: Sequence of embedding vectors, in the same row order as
            ``df``.
        dim: Dimension used when creating the index.
        df: DataFrame whose ``id`` column provides the vector ids.

    Returns:
        The result of ``upsert`` on success, or the string
        ``'Index name : <index> already exists'`` when an index with that
        name is already present (nothing is created or uploaded then).
    """
    # Pair ids and vectors by position. ``df['id'][i]`` was a label lookup
    # and failed (or picked the wrong row) for frames whose index is not
    # 0..n-1, such as a filtered DataFrame.
    vectors = [
        {'id': paper_id, 'values': values}
        for paper_id, values in zip(df['id'], embeddings)
    ]

    pc = Pinecone(api_key=api_key)
    if index in pc.list_indexes().names():
        logging.warning(f'Index name : {index} already exists.')
        return f'Index name : {index} already exists'

    pc.create_index(
        name=index,
        dimension=dim,
        metric="cosine",
        deletion_protection="disabled",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )
    return pc.Index(index).upsert(vectors=vectors, namespace=namespace)
def get_new_papers(df):
    """Return the rows of ``df`` that are not yet in ``arxiv-scrape.csv``.

    The CSV is read from the current working directory and compared with
    ``df`` on all columns they share. Rows found only in ``df`` are appended
    to the CSV (after de-duplication) and returned.

    Args:
        df: DataFrame of candidate papers. It is not modified.

    Returns:
        A DataFrame of the new papers with a fresh 0..n-1 index, or the
        string ``'No New Papers Found'`` when every row is already known.
    """
    df_main = pd.read_csv('arxiv-scrape.csv')

    # Work on a copy with a clean positional index instead of resetting the
    # caller's frame in place; ``drop=True`` also avoids relying on the old
    # index being materialised as a column literally named 'index'.
    candidates = df.reset_index(drop=True)

    merged = candidates.merge(df_main, how='left', indicator=True)
    new_papers = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
    if new_papers.empty:
        return 'No New Papers Found'

    # Renumber the rows so callers can align them by position with
    # per-paper data such as embeddings.
    new_papers = new_papers.reset_index(drop=True)

    df_main = pd.concat([df_main, new_papers], ignore_index=True)
    df_main.drop_duplicates(inplace=True)
    df_main.to_csv('arxiv-scrape.csv', index=False)
    return new_papers
def recommend_papers(api_key, index, namespace, embeddings, df, threshold, top_k=3):
    """Score candidate papers against a Pinecone index and list the hits.

    For every embedding the ``top_k`` nearest vectors in ``namespace`` are
    queried and their scores summed. A paper is recommended when that sum is
    greater than ``threshold``; the reported score is the sum divided by
    ``top_k``.

    Args:
        api_key: Pinecone API key.
        index: Name of an existing Pinecone index.
        namespace: Namespace to query.
        embeddings: Sequence of query vectors, in the same row order as
            ``df``.
        df: DataFrame whose ``id`` column holds the paper URLs.
        threshold: Minimum summed score (exclusive) for a recommendation.
        top_k: Number of neighbours queried per paper; defaults to 3, the
            value that was previously hard-coded.

    Returns:
        Newline-joined Markdown lines, one per recommended paper, or the
        string ``'No Interesting Paper'`` when nothing passes the threshold.

    Raises:
        ValueError: If ``index`` does not exist.
    """
    pc = Pinecone(api_key=api_key)
    if index not in pc.list_indexes().names():
        raise ValueError(f"{index} doesnt exist. Project isnt initialized properly")
    pinecone_index = pc.Index(index)

    results = []
    # Pair ids and embeddings by position. ``df['id'][i]`` was a label lookup
    # and raised KeyError (or picked the wrong paper) when ``df`` carried a
    # non-contiguous index, e.g. after filtering out already-known papers.
    for paper_id, embedding in zip(df['id'], embeddings):
        result = pinecone_index.query(
            namespace=namespace,
            vector=embedding,
            top_k=top_k,
            include_values=False,
        )
        sum_score = sum(match['score'] for match in result['matches'])
        if sum_score > threshold:
            results.append(f"Paper-URL : [{paper_id}]({paper_id}) with score: {sum_score / top_k} <br />")

    if results:
        return '\n'.join(results)
    return 'No Interesting Paper'