vtiyyal1 committed
Commit a084a92 · verified · 1 Parent(s): 59df1c3

Upload 2 files


Fixed a URL typo and reverted the Solr search code back to the old version.

Files changed (2):
  1. app.py +1 -1
  2. get_articles.py +68 -89
app.py CHANGED
@@ -94,7 +94,7 @@ Be concise but informative. If a specific detail isn't in the content, say so ra
 
 url_prompt = """Generate a Tobacco Watcher article URL based on the query. Follow these rules:
 
-1. Base URL: https://tobaccowatcher.globaltobactocontrol.org/articles/
+1. Base URL: https://tobaccowatcher.globaltobaccocontrol.org/articles/
 2. Parameters:
 - Subject (c=): Can have multiple
 - Product (pro=): Can have multiple
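Note on the fix: the corrected base URL combines with the repeatable c= and pro= parameters the prompt lists. A minimal sketch of that URL pattern, assuming the parameters are ordinary query-string values; the subject and product values below are hypothetical examples, not taken from the prompt:

from urllib.parse import urlencode

BASE_URL = "https://tobaccowatcher.globaltobaccocontrol.org/articles/"

def build_article_url(subjects=(), products=()):
    # c= and pro= may each appear multiple times, so build a list of pairs.
    params = [("c", s) for s in subjects] + [("pro", p) for p in products]
    return f"{BASE_URL}?{urlencode(params)}" if params else BASE_URL

# Hypothetical example:
# build_article_url(subjects=["advertising"], products=["e-cigarettes"])
# -> https://tobaccowatcher.globaltobaccocontrol.org/articles/?c=advertising&pro=e-cigarettes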
get_articles.py CHANGED
@@ -71,97 +71,76 @@ Minor details:
 """
 def save_solr_articles(keywords: str, num_articles=15) -> str:
     """Save top articles from Solr search to CSV."""
-    try:
-        solr_key = os.getenv("SOLR_KEY")
-        SOLR_ARTICLES_URL = f"https://website:{solr_key}@solr.machines.globalhealthwatcher.org:8080/solr/articles/"
-        solr = Solr(SOLR_ARTICLES_URL, verify=False)
-
-        # No duplicates and must be in English
-        fq = ['-dups:0', 'is_english:(true)']
-
-        # Construct and sanitize query
-        query = f'text:({keywords}) AND dead_url:(false)'
-
-        print(f"Executing Solr query: {query}")
-
-        # Use boost function to combine relevance score with recency
-        # This gives higher weight to more recent articles while still considering relevance
-        boost_query = "sum(score,product(0.3,recip(ms(NOW,year_month_day),3.16e-11,1,1)))"
-
-        try:
-            outputs = solr.search(
-                query,
-                fq=fq,
-                sort=boost_query + " desc",
-                rows=num_articles * 2,
-                fl='*,score'  # Include score in results
-            )
-        except Exception as e:
-            print(f"Solr query failed: {str(e)}")
-            raise
-
-        article_count = 0
-        save_path = os.path.join("data", "articles.csv")
-        if not os.path.exists(os.path.dirname(save_path)):
-            os.makedirs(os.path.dirname(save_path))
-
-        with open(save_path, 'w', newline='') as csvfile:
-            fieldnames = ['title', 'uuid', 'content', 'url', 'domain', 'published_date']
-            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
-            writer.writeheader()
-
-            title_five_words = set()
-
-            for d in outputs.docs:
-                if article_count == num_articles:
-                    break
-
-                # Skip if required fields are missing
-                if 'title' not in d or 'uuid' not in d or 'cleaned_content' not in d or 'url' not in d:
-                    continue
-
-                title_cleaned = remove_spaces_newlines(d['title'])
-
-                # Skip duplicate titles based on first five words
-                split = title_cleaned.split()
-                if len(split) >= 5:
-                    five_words = ' '.join(split[:5])
-                    if five_words in title_five_words:
-                        continue
-                    title_five_words.add(five_words)
-
-                article_count += 1
-
-                cleaned_content = remove_spaces_newlines(d['cleaned_content'])
-                cleaned_content = truncate_article(cleaned_content)
-
-                domain = d.get('domain', "Not Specified")
-                raw_date = d.get('year_month_day', "Unknown Date")
-
-                # Format the date
-                if raw_date != "Unknown Date":
-                    try:
-                        publication_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%m/%d/%Y")
-                    except ValueError:
-                        publication_date = "Invalid Date"
-                else:
-                    publication_date = raw_date
-
-                writer.writerow({
-                    'title': title_cleaned,
-                    'uuid': d['uuid'],
-                    'content': cleaned_content,
-                    'url': d['url'],
-                    'domain': domain,
-                    'published_date': publication_date
-                })
-                print(f"Article saved: {title_cleaned}, {d['uuid']}, {domain}, {publication_date}")
-
-        return save_path
-
-    except Exception as e:
-        print(f"Error in save_solr_articles: {str(e)}")
-        raise
+    solr_key = os.getenv("SOLR_KEY")
+    SOLR_ARTICLES_URL = f"https://website:{solr_key}@solr.machines.globalhealthwatcher.org:8080/solr/articles/"
+    solr = Solr(SOLR_ARTICLES_URL, verify=False)
+
+    # No duplicates
+    fq = ['-dups:0']
+
+    query = f'text:({keywords})' + " AND " + "dead_url:(false)"
+
+    # Get top 2*num_articles articles and then remove misformed or duplicate articles
+    outputs = solr.search(query, fq=fq, sort="score desc", rows=num_articles * 2)
+
+    article_count = 0
+
+    save_path = os.path.join("data", "articles.csv")
+    if not os.path.exists(os.path.dirname(save_path)):
+        os.makedirs(os.path.dirname(save_path))
+
+    with open(save_path, 'w', newline='') as csvfile:
+        fieldnames = ['title', 'uuid', 'content', 'url', 'domain', 'published_date']
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
+        writer.writeheader()
+
+        title_five_words = set()
+
+        for d in outputs.docs:
+            if article_count == num_articles:
+                break
+
+            # skip if title returns a keyerror
+            if 'title' not in d or 'uuid' not in d or 'cleaned_content' not in d or 'url' not in d:
+                continue
+
+            title_cleaned = remove_spaces_newlines(d['title'])
+
+            split = title_cleaned.split()
+            # skip if title is a duplicate
+            if not len(split) < 5:
+                five_words = title_cleaned.split()[:5]
+                five_words = ' '.join(five_words)
+                if five_words in title_five_words:
+                    continue
+                title_five_words.add(five_words)
+
+            article_count += 1
+
+            cleaned_content = remove_spaces_newlines(d['cleaned_content'])
+            cleaned_content = truncate_article(cleaned_content)
+
+            domain = ""
+            if 'domain' not in d:
+                domain = "Not Specified"
+            else:
+                domain = d['domain']
+
+            raw_date = d.get('year_month_day', "Unknown Date")
+
+            # Format the date from YYYY-MM-DD to MM/DD/YYYY if available
+            if raw_date != "Unknown Date":
+                try:
+                    publication_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%m/%d/%Y")
+                except ValueError:
+                    publication_date = "Invalid Date"
+            else:
+                publication_date = raw_date
+
+            writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
+                             'domain': domain, 'published_date': publication_date})
+
+    return save_path
 
 
 def save_embedding_base_articles(query, article_embeddings, titles, contents, uuids, urls, num_articles=15):
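Note on the revert: dropping the boost function means results are ordered by plain relevance again. For anyone re-adding it later, Solr's recip(x, m, a, b) computes a / (m * x + b), so recip(ms(NOW,year_month_day), 3.16e-11, 1, 1) decays from 1 toward 0 as an article ages; 3.16e-11 is roughly 1 / (milliseconds per year), so a one-year-old article sits near 0.5. A back-of-envelope Python mirror of the removed product(0.3, recip(...)) term:

MS_PER_YEAR = 365.25 * 24 * 3600 * 1000  # about 3.16e10 ms

def recency_boost(age_years: float, weight: float = 0.3) -> float:
    """Mirror of product(0.3, recip(ms(NOW, date), 3.16e-11, 1, 1))."""
    age_ms = age_years * MS_PER_YEAR
    return weight / (3.16e-11 * age_ms + 1.0)

for years in (0, 1, 5):
    print(years, round(recency_boost(years), 3))  # 0.3, 0.15, 0.05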
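One caveat that applies to both versions: the keywords string is interpolated into the query unescaped, so Lucene metacharacters (quotes, colons, parentheses) can break the query or change its meaning. A hedged sketch of one way to guard against that; escape_solr is a hypothetical helper, not part of this repo:

def escape_solr(text: str) -> str:
    # Backslash-escape the Lucene/Solr query metacharacters.
    specials = set('+-&|!(){}[]^"~*?:\\/')
    return ''.join('\\' + ch if ch in specials else ch for ch in text)

query = f'text:({escape_solr("heated tobacco: what now?")}) AND dead_url:(false)'
# -> text:(heated tobacco\: what now\?) AND dead_url:(false)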
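And a short smoke test of the restored function, assuming SOLR_KEY is set in the environment and the Solr host is reachable; the keyword string is a hypothetical example:

import csv
from get_articles import save_solr_articles

path = save_solr_articles("menthol ban", num_articles=5)
with open(path, newline='') as f:
    for row in csv.DictReader(f):
        print(row['published_date'], row['title'])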