from pysolr import Solr
import os
import csv
import re
from datetime import datetime

import torch
from sentence_transformers import SentenceTransformer, util

from get_keywords import get_keywords


def sanitize_query(text):
    """Sanitize the query text for Solr."""
    # Remove special characters that could break Solr query syntax
    sanitized = re.sub(r'[[\]{}()*+?\\^|;:!]', ' ', text)
    # Normalize whitespace
    sanitized = ' '.join(sanitized.split())
    return sanitized


"""
Fetches the top articles from Solr for a query and saves them to a CSV file.
Input:
    query: str
    num_articles: int
    keyword_type: str ("openai", "rake", or "na")
Output: path to the CSV file
"""
def save_solr_articles_full(query: str, num_articles: int, keyword_type: str = "openai") -> str:
    keywords = get_keywords(query, keyword_type)
    if keyword_type == "na":
        keywords = query
    # Sanitize keywords before building the Solr query
    keywords = sanitize_query(keywords)
    return save_solr_articles(keywords, num_articles)


"""
Removes newlines and doubled spaces from text
Input: text: str
Output: text: str
"""
def remove_spaces_newlines(text: str) -> str:
    text = text.replace('\n', ' ')
    text = text.replace('  ', ' ')
    return text


# Truncates long articles to 1500 words
def truncate_article(text: str) -> str:
    split = text.split()
    if len(split) > 1500:
        split = split[:1500]
        text = ' '.join(split)
    return text
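
# Illustrative sanity-check sketch for the text-cleaning helpers above. The sample
# strings are made-up examples, not real project data, and this function is not
# called anywhere in the pipeline.
def _demo_text_helpers() -> None:
    assert sanitize_query('covid-19 (vaccine) efficacy!') == 'covid-19 vaccine efficacy'
    assert remove_spaces_newlines('Title\nwith  breaks') == 'Title with breaks'
    assert len(truncate_article(' '.join(['word'] * 2000)).split()) == 1500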
""" def save_solr_articles(keywords: str, num_articles=15) -> str: """Save top articles from Solr search to CSV.""" try: solr_key = os.getenv("SOLR_KEY") SOLR_ARTICLES_URL = f"https://website:{solr_key}@solr.machines.globalhealthwatcher.org:8080/solr/articles/" solr = Solr(SOLR_ARTICLES_URL, verify=False) # No duplicates and must be in English fq = ['-dups:0', 'is_english:(true)'] # Construct and sanitize query query = f'text:({keywords}) AND dead_url:(false)' print(f"Executing Solr query: {query}") # Use boost function to combine relevance score with recency # This gives higher weight to more recent articles while still considering relevance boost_query = "sum(score,product(0.3,recip(ms(NOW,year_month_day),3.16e-11,1,1)))" try: outputs = solr.search( query, fq=fq, sort=boost_query + " desc", rows=num_articles * 2, fl='*,score' # Include score in results ) except Exception as e: print(f"Solr query failed: {str(e)}") raise article_count = 0 save_path = os.path.join("data", "articles.csv") if not os.path.exists(os.path.dirname(save_path)): os.makedirs(os.path.dirname(save_path)) with open(save_path, 'w', newline='') as csvfile: fieldnames = ['title', 'uuid', 'content', 'url', 'domain', 'published_date'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC) writer.writeheader() title_five_words = set() for d in outputs.docs: if article_count == num_articles: break # Skip if required fields are missing if 'title' not in d or 'uuid' not in d or 'cleaned_content' not in d or 'url' not in d: continue title_cleaned = remove_spaces_newlines(d['title']) # Skip duplicate titles based on first five words split = title_cleaned.split() if len(split) >= 5: five_words = ' '.join(split[:5]) if five_words in title_five_words: continue title_five_words.add(five_words) article_count += 1 cleaned_content = remove_spaces_newlines(d['cleaned_content']) cleaned_content = truncate_article(cleaned_content) domain = d.get('domain', "Not Specified") raw_date = d.get('year_month_day', "Unknown Date") # Format the date if raw_date != "Unknown Date": try: publication_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%m/%d/%Y") except ValueError: publication_date = "Invalid Date" else: publication_date = raw_date writer.writerow({ 'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'], 'domain': domain, 'published_date': publication_date }) print(f"Article saved: {title_cleaned}, {d['uuid']}, {domain}, {publication_date}") return save_path except Exception as e: print(f"Error in save_solr_articles: {str(e)}") raise def save_embedding_base_articles(query, article_embeddings, titles, contents, uuids, urls, num_articles=15): bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') query_embedding = bi_encoder.encode(query, convert_to_tensor=True) hits = util.semantic_search(query_embedding, article_embeddings, top_k=15) hits = hits[0] corpus_ids = [item['corpus_id'] for item in hits] r_contents = [contents[idx] for idx in corpus_ids] r_titles = [titles[idx] for idx in corpus_ids] r_uuids = [uuids[idx] for idx in corpus_ids] r_urls = [urls[idx] for idx in corpus_ids] save_path = os.path.join("data", "articles.csv") if not os.path.exists(os.path.dirname(save_path)): os.makedirs(os.path.dirname(save_path)) with open(save_path, 'w', newline='', encoding="utf-8") as csvfile: fieldNames = ['title', 'uuid', 'content', 'url'] writer = csv.DictWriter(csvfile, fieldnames=fieldNames, quoting=csv.QUOTE_NONNUMERIC) writer.writeheader() for i in 

def save_embedding_base_articles(query, article_embeddings, titles, contents, uuids, urls, num_articles=15):
    """Save the articles most similar to the query (by embedding similarity) to CSV."""
    bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)

    # Retrieve the top matches by similarity between the query and article embeddings
    hits = util.semantic_search(query_embedding, article_embeddings, top_k=num_articles)
    hits = hits[0]

    corpus_ids = [item['corpus_id'] for item in hits]
    r_contents = [contents[idx] for idx in corpus_ids]
    r_titles = [titles[idx] for idx in corpus_ids]
    r_uuids = [uuids[idx] for idx in corpus_ids]
    r_urls = [urls[idx] for idx in corpus_ids]

    save_path = os.path.join("data", "articles.csv")
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))

    with open(save_path, 'w', newline='', encoding="utf-8") as csvfile:
        fieldnames = ['title', 'uuid', 'content', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        # Write at most num_articles rows; semantic_search may return fewer hits
        for i in range(min(num_articles, len(r_titles))):
            writer.writerow({
                'title': r_titles[i],
                'uuid': r_uuids[i],
                'content': r_contents[i],
                'url': r_urls[i]
            })

    return save_path
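
# Minimal usage sketch, assuming SOLR_KEY is set in the environment, the Solr instance
# configured above is reachable, and get_keywords accepts the chosen keyword_type
# (its output is discarded for "na"); the query string is only an illustrative example.
if __name__ == "__main__":
    csv_path = save_solr_articles_full("malaria vaccine rollout", num_articles=5, keyword_type="na")
    print(f"Articles written to {csv_path}")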