# Page header captured with the source — Spaces status: Sleeping
import csv
import uuid

import pandas as pd
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModel
# Load the PubMedBERT tokenizer and model directly.
# The checkpoint id is named once so the tokenizer and the model can never
# drift apart.
# NOTE(review): this runs at module level, and Streamlit re-executes the script
# on each interaction, so the load is presumably repeated per rerun — consider
# caching (e.g. st.cache_resource) if the installed Streamlit supports it.
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Step 1: Read the CSV and collect the first MAX_GRANTS rows whose "postdate"
# equals TARGET_YEAR. Rows are kept in file order as plain dicts.
csv_filename = "new.csv"
TARGET_YEAR = 2023  # Only consider grants from this year
MAX_GRANTS = 3000  # Upper bound on the number of grants that get embedded
grants_data = []
# newline="" lets the csv module handle line endings inside quoted fields.
with open(csv_filename, "r", encoding="utf-8", newline="") as csv_file:
    for row in csv.DictReader(csv_file):
        # NOTE(review): int() assumes "postdate" holds a bare year; any other
        # format (or a blank cell) raises here — confirm against the data.
        if int(row["postdate"]) != TARGET_YEAR:
            continue
        grants_data.append(row)
        if len(grants_data) >= MAX_GRANTS:  # Stop once the cap is reached
            break
# Streamlit app: rank the loaded grants against a user-supplied reference text
# and save the query plus the ranking to CSV files.
st.title("Grants Ranking System")


def _embed(text):
    """Return the mean-pooled embedding of *text* from the module-level model.

    The text is tokenized with truncation enabled, passed through ``model``
    without gradient tracking, and the last hidden state is averaged over
    dim=1 (the token axis), giving one vector per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)


# Step 2: Define the reference text using a text input widget
reference_text = st.text_input("Enter the reference text here:")

# Step 3: Calculate similarity scores for each grant
if st.button("Calculate Similarity"):
    if not reference_text.strip():
        # Nothing to compare against; ranking on an empty query is meaningless.
        st.warning("Please enter a reference text before calculating similarity.")
        st.stop()

    # The reference embedding does not depend on the grant, so compute it once
    # instead of once per grant.
    reference_embedding = _embed(reference_text)

    similarity_scores = []
    for grant_data in grants_data:
        # Cap the title at 3000 characters before tokenizing.
        grant_title = grant_data["opportunitytitle"][:3000]
        grant_embedding = _embed(grant_title)
        # Similarity is the raw (un-normalized) dot product of the embeddings.
        similarity_score = torch.matmul(
            reference_embedding, grant_embedding.transpose(0, 1)
        ).item()
        similarity_scores.append(similarity_score)

    # Step 4: Sort grants by similarity, highest first. Scores and rows are
    # sorted together so each row keeps its own score; sorting on the score
    # alone keeps ties in file order and never compares the row dicts.
    ranked_pairs = sorted(
        zip(similarity_scores, grants_data),
        key=lambda pair: pair[0],
        reverse=True,
    )
    sorted_scores = [score for score, _ in ranked_pairs]
    sorted_grants_data = [grant for _, grant in ranked_pairs]

    # Step 5: Assign ranks to the grants based on similarity scores (1 = best)
    ranks = list(range(1, len(sorted_grants_data) + 1))

    # Step 6: Create a new DataFrame with ranks and similarity scores
    ranked_grants_df = pd.DataFrame(sorted_grants_data)
    ranked_grants_df["Similarity_Score"] = sorted_scores
    ranked_grants_df["Rank"] = ranks

    # Step 7: Display the ranked grants DataFrame in the Streamlit app
    st.write(ranked_grants_df)

    # Step 8: Add a radio button widget to get user input on the search quality
    # NOTE(review): the files below are written in the same run that first
    # renders this radio, so the saved feedback is presumably always the
    # default option; persisting a later choice would need st.session_state
    # and a separate save action — confirm the intended flow.
    feedback = st.radio("How do you rate this search?", ("Good", "Ok", "Bad"))

    # Step 9: Generate a unique identifier for this search
    search_id = str(uuid.uuid4())

    # Step 10: Save the reference text along with the feedback to a CSV file
    # with the unique identifier
    reference_text_filename = f"reference_text_{search_id}.csv"
    reference_text_df = pd.DataFrame({
        "Search_ID": [search_id],
        "Reference_Text": [reference_text],
        "Feedback": [feedback],
    })
    reference_text_df.to_csv(reference_text_filename, index=False)

    # Step 11: Save the ranked grants DataFrame to a CSV file with the unique
    # identifier
    ranked_grants_filename = f"ranked_grants_{search_id}.csv"
    ranked_grants_df.to_csv(ranked_grants_filename, index=False)
    st.success("Reference text and ranked grants saved to CSV files.")