Spaces:

Nandan1377
/

Grant_Ranking_PUBMED

Sleeping

App Files Files Community

Grant_Ranking_PUBMED / app.py

Nandan1377

Update app.py

f9eb82f 11 months ago

raw history blame contribute delete

No virus

3.91 kB

	import csv
	import uuid

	import pandas as pd
	import streamlit as st
	import torch
	from transformers import AutoTokenizer, AutoModel
	# Use a pipeline as a high-level helper
	from transformers import pipeline

	# Hugging Face checkpoint id shared by the pipeline, tokenizer and encoder below.
	model_id = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

	# NOTE(review): `pipe` is not referenced by any other statement in the
	# visible script; it looks removable -- confirm before deleting.
	pipe = pipeline("fill-mask", model=model_id)

	# Load the PubMedBERT tokenizer and model directly; these two objects are
	# what the similarity computation further down actually uses.
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModel.from_pretrained(model_id)

	# Step 1: Read the CSV and keep the rows whose "postdate" parses to 2023,
	# stopping as soon as 3000 such rows have been collected.
	csv_filename = "new.csv"
	grants_data = []
	i = 0
	with open(csv_filename, "r", encoding="utf-8") as csv_file:
	    for i, row in enumerate(csv.DictReader(csv_file)):
	        # Skip every row that is not from 2023.
	        if int(row["postdate"]) != 2023:
	            continue
	        grants_data.append(row)
	        # Cap the working set at 3000 grants.
	        if len(grants_data) == 3000:
	            break

	# Streamlit app
	st.title("Grants Ranking System")

	# Step 2: Define the reference text using a text input widget
	reference_text = st.text_input("Enter the reference text here:")


	def _embed(text):
	    """Return the model's last hidden state for *text*, averaged over dim 1.

	    The text is tokenized with truncation enabled and run through the model
	    with gradient tracking disabled.
	    """
	    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
	    with torch.no_grad():
	        hidden_states = model(**encoded).last_hidden_state
	    return hidden_states.mean(dim=1)


	# Step 3: Calculate similarity scores for each grant
	if st.button("Calculate Similarity"):
	    # The reference text is the same for every grant, so embed it once
	    # instead of once per loop iteration.
	    reference_embedding = _embed(reference_text)

	    similarity_scores = []
	    for grant_data in grants_data:
	        # Only the first 3000 characters of the title are embedded.
	        grant_title = grant_data["opportunitytitle"][:3000]
	        grant_embedding = _embed(grant_title)

	        # Similarity is the raw (un-normalized) dot product of the embeddings.
	        similarity_score = torch.matmul(
	            reference_embedding, grant_embedding.transpose(0, 1)
	        ).item()
	        similarity_scores.append(similarity_score)

	    # Step 4: Sort grants by similarity score, highest first. The scores are
	    # reordered together with the grants so that row k of the table carries
	    # the score that produced its position.
	    sorted_indices = sorted(
	        range(len(similarity_scores)),
	        key=lambda k: similarity_scores[k],
	        reverse=True,
	    )
	    sorted_grants_data = [grants_data[idx] for idx in sorted_indices]
	    sorted_scores = [similarity_scores[idx] for idx in sorted_indices]

	    # Step 5: Assign ranks to the grants based on similarity scores
	    ranks = list(range(1, len(sorted_grants_data) + 1))

	    # Step 6: Create a new DataFrame with ranks and similarity scores
	    ranked_grants_df = pd.DataFrame(sorted_grants_data)
	    ranked_grants_df["Similarity_Score"] = sorted_scores
	    ranked_grants_df["Rank"] = ranks

	    # Step 7: Display the ranked grants DataFrame in the Streamlit app
	    st.write(ranked_grants_df)

	    # Step 8: Add a radio button widget to get user input on the search quality
	    # NOTE(review): this widget is read in the same run that writes the CSV
	    # below, so the saved value is whatever the radio holds at that moment.
	    # Presumably a later change by the user triggers a rerun in which the
	    # button branch is not taken -- confirm against Streamlit's rerun model
	    # and consider persisting results in st.session_state.
	    feedback = st.radio("How do you rate this search?", ("Good", "Ok", "Bad"))

	    # Step 9: Generate a unique identifier for this search
	    search_id = str(uuid.uuid4())

	    # Step 10: Save the reference text along with the feedback to a CSV file with the unique identifier
	    reference_text_filename = f"reference_text_{search_id}.csv"
	    reference_text_df = pd.DataFrame({
	        "Search_ID": [search_id],
	        "Reference_Text": [reference_text],
	        "Feedback": [feedback],
	    })
	    reference_text_df.to_csv(reference_text_filename, index=False)

	    # Step 11: Save the ranked grants DataFrame to a CSV file with the unique identifier
	    ranked_grants_filename = f"ranked_grants_{search_id}.csv"
	    ranked_grants_df.to_csv(ranked_grants_filename, index=False)

	    st.success("Reference text and ranked grants saved to CSV files.")