Spaces:

Nandan1377
/

Grant_Ranking_PUBMED

Sleeping

App Files Files Community

Nandan1377 committed on Jul 30, 2023

Commit

b35ebeb

•

1 Parent(s): 6ef4921

Create app.py

Browse files

Files changed (1) hide show

app.py +84 -0

app.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import csv
+import uuid
+
+import pandas as pd
+import streamlit as st
+import torch
+from transformers import AutoTokenizer, AutoModel
+# Load the PubMedBERT tokenizer and model directly
+tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
+model = AutoModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
+# Step 1: Read CSV and extract relevant rows for the first ten data points
+csv_filename = "new.csv"
+grants_data = []
+with open(csv_filename, "r", encoding="utf-8") as csv_file:
+    csv_reader = csv.DictReader(csv_file)
+    for i, row in enumerate(csv_reader):
+        if int(row["postdate"]) == 2023:  # Only consider grants from 2023
+            grants_data.append(row)
+            if len(grants_data) == 3000:  # Stop after collecting ten grants
+                break
+# Streamlit app
+st.title("Grants Ranking System")
+# Step 2: Define the reference text using a text input widget
+reference_text = st.text_input("Enter the reference text here:")
+# Calculate similarity scores for each grant
+if st.button("Calculate Similarity"):
+    similarity_scores = []
+    for grant_data in grants_data:
+        grant_description = grant_data["opportunitytitle"][:3000]  # Truncate the description to reduce tokens
+        # Tokenize the reference text and grant description
+        inputs1 = tokenizer(reference_text, return_tensors="pt", padding=True, truncation=True)
+        inputs2 = tokenizer(grant_description, return_tensors="pt", padding=True, truncation=True)
+        # Calculate embeddings for reference text and grant description
+        with torch.no_grad():
+            outputs1 = model(**inputs1)
+            outputs2 = model(**inputs2)
+        embeddings1 = outputs1.last_hidden_state.mean(dim=1)
+        embeddings2 = outputs2.last_hidden_state.mean(dim=1)
+        # Calculate similarity score using dot product of embeddings
+        similarity_score = torch.matmul(embeddings1, embeddings2.transpose(0, 1)).item()
+        similarity_scores.append(similarity_score)
+    # Step 4: Sort grants based on similarity scores and assign ranks
+    sorted_indices = sorted(range(len(similarity_scores)), key=lambda k: similarity_scores[k], reverse=True)
+    sorted_grants_data = [grants_data[i] for i in sorted_indices]
+    # Step 5: Assign ranks to the grants based on similarity scores
+    ranks = list(range(1, len(sorted_grants_data) + 1))
+    # Step 6: Create a new DataFrame with ranks and similarity scores
+    ranked_grants_df = pd.DataFrame(sorted_grants_data)
+    ranked_grants_df["Similarity_Score"] = similarity_scores
+    ranked_grants_df["Rank"] = ranks
+    # Step 7: Display the ranked grants DataFrame in the Streamlit app
+    st.write(ranked_grants_df)
+    # Step 8: Add a radio button widget to get user input on the search quality
+    feedback = st.radio("How do you rate this search?", ("Good", "Ok", "Bad"))
+    # Step 9: Generate a unique identifier for this search
+    search_id = str(uuid.uuid4())
+    # Step 10: Save the reference text along with the feedback to a CSV file with the unique identifier
+    reference_text_filename = f"reference_text_{search_id}.csv"
+    reference_text_df = pd.DataFrame({
+        "Search_ID": [search_id],
+        "Reference_Text": [reference_text],
+        "Feedback": [feedback]
+    })
+    reference_text_df.to_csv(reference_text_filename, index=False)
+    # Step 11: Save the ranked grants DataFrame to a CSV file with the unique identifier
+    ranked_grants_filename = f"ranked_grants_{search_id}.csv"
+    ranked_grants_df.to_csv(ranked_grants_filename, index=False)
+    st.success("Reference text and ranked grants saved to CSV files.")