vtiyyal1 committed
Commit a084a92 · verified · 1 Parent(s): 59df1c3

Upload 2 files


Fixed a URL typo and reverted the Solr search code back to the old version.

Files changed (2):
  1. app.py +1 -1
  2. get_articles.py +68 -89
app.py CHANGED
@@ -94,7 +94,7 @@ Be concise but informative. If a specific detail isn't in the content, say so ra
 
 url_prompt = """Generate a Tobacco Watcher article URL based on the query. Follow these rules:
 
-1. Base URL: https://tobaccowatcher.globaltobactocontrol.org/articles/
+1. Base URL: https://tobaccowatcher.globaltobaccocontrol.org/articles/
 2. Parameters:
 - Subject (c=): Can have multiple
 - Product (pro=): Can have multiple
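Note on the fix: the corrected base URL combines with the repeatable c= and pro= parameters the prompt lists. A minimal sketch of that URL pattern, assuming the parameters are ordinary query-string values; the subject and product values below are hypothetical examples, not taken from the prompt:

from urllib.parse import urlencode

BASE_URL = "https://tobaccowatcher.globaltobaccocontrol.org/articles/"

def build_article_url(subjects=(), products=()):
    # c= and pro= may each appear multiple times, so build a list of pairs.
    params = [("c", s) for s in subjects] + [("pro", p) for p in products]
    return f"{BASE_URL}?{urlencode(params)}" if params else BASE_URL

# Hypothetical example:
# build_article_url(subjects=["advertising"], products=["e-cigarettes"])
# -> https://tobaccowatcher.globaltobaccocontrol.org/articles/?c=advertising&pro=e-cigarettes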
get_articles.py CHANGED
@@ -71,97 +71,76 @@ Minor details:
 """
 def save_solr_articles(keywords: str, num_articles=15) -> str:
     """Save top articles from Solr search to CSV."""
-    try:
-        solr_key = os.getenv("SOLR_KEY")
-        SOLR_ARTICLES_URL = f"https://website:{solr_key}@solr.machines.globalhealthwatcher.org:8080/solr/articles/"
-        solr = Solr(SOLR_ARTICLES_URL, verify=False)
-
-        # No duplicates and must be in English
-        fq = ['-dups:0', 'is_english:(true)']
-
-        # Construct and sanitize query
-        query = f'text:({keywords}) AND dead_url:(false)'
-
-        print(f"Executing Solr query: {query}")
-
-        # Use boost function to combine relevance score with recency
-        # This gives higher weight to more recent articles while still considering relevance
-        boost_query = "sum(score,product(0.3,recip(ms(NOW,year_month_day),3.16e-11,1,1)))"
-
-        try:
-            outputs = solr.search(
-                query,
-                fq=fq,
-                sort=boost_query + " desc",
-                rows=num_articles * 2,
-                fl='*,score'  # Include score in results
-            )
-        except Exception as e:
-            print(f"Solr query failed: {str(e)}")
-            raise
-
-        article_count = 0
-        save_path = os.path.join("data", "articles.csv")
-        if not os.path.exists(os.path.dirname(save_path)):
-            os.makedirs(os.path.dirname(save_path))
-
-        with open(save_path, 'w', newline='') as csvfile:
-            fieldnames = ['title', 'uuid', 'content', 'url', 'domain', 'published_date']
-            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
-            writer.writeheader()
-
-            title_five_words = set()
-
-            for d in outputs.docs:
-                if article_count == num_articles:
-                    break
-
-                # Skip if required fields are missing
-                if 'title' not in d or 'uuid' not in d or 'cleaned_content' not in d or 'url' not in d:
-                    continue
-
-                title_cleaned = remove_spaces_newlines(d['title'])
-
-                # Skip duplicate titles based on first five words
-                split = title_cleaned.split()
-                if len(split) >= 5:
-                    five_words = ' '.join(split[:5])
-                    if five_words in title_five_words:
-                        continue
-                    title_five_words.add(five_words)
-
-                article_count += 1
-
-                cleaned_content = remove_spaces_newlines(d['cleaned_content'])
-                cleaned_content = truncate_article(cleaned_content)
-
-                domain = d.get('domain', "Not Specified")
-                raw_date = d.get('year_month_day', "Unknown Date")
-
-                # Format the date
-                if raw_date != "Unknown Date":
-                    try:
-                        publication_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%m/%d/%Y")
-                    except ValueError:
-                        publication_date = "Invalid Date"
-                else:
-                    publication_date = raw_date
-
-                writer.writerow({
-                    'title': title_cleaned,
-                    'uuid': d['uuid'],
-                    'content': cleaned_content,
-                    'url': d['url'],
-                    'domain': domain,
-                    'published_date': publication_date
-                })
-                print(f"Article saved: {title_cleaned}, {d['uuid']}, {domain}, {publication_date}")
-
-        return save_path
-
-    except Exception as e:
-        print(f"Error in save_solr_articles: {str(e)}")
-        raise
+    solr_key = os.getenv("SOLR_KEY")
+    SOLR_ARTICLES_URL = f"https://website:{solr_key}@solr.machines.globalhealthwatcher.org:8080/solr/articles/"
+    solr = Solr(SOLR_ARTICLES_URL, verify=False)
+
+    # No duplicates
+    fq = ['-dups:0']
+
+    query = f'text:({keywords})' + " AND " + "dead_url:(false)"
+
+    # Get top 2*num_articles articles and then remove misformed or duplicate articles
+    outputs = solr.search(query, fq=fq, sort="score desc", rows=num_articles * 2)
+
+    article_count = 0
+
+    save_path = os.path.join("data", "articles.csv")
+    if not os.path.exists(os.path.dirname(save_path)):
+        os.makedirs(os.path.dirname(save_path))
+
+    with open(save_path, 'w', newline='') as csvfile:
+        fieldnames = ['title', 'uuid', 'content', 'url', 'domain', 'published_date']
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
+        writer.writeheader()
+
+        title_five_words = set()
+
+        for d in outputs.docs:
+            if article_count == num_articles:
+                break
+
+            # skip if title returns a keyerror
+            if 'title' not in d or 'uuid' not in d or 'cleaned_content' not in d or 'url' not in d:
+                continue
+
+            title_cleaned = remove_spaces_newlines(d['title'])
+
+            split = title_cleaned.split()
+            # skip if title is a duplicate
+            if not len(split) < 5:
+                five_words = title_cleaned.split()[:5]
+                five_words = ' '.join(five_words)
+                if five_words in title_five_words:
+                    continue
+                title_five_words.add(five_words)
+
+            article_count += 1
+
+            cleaned_content = remove_spaces_newlines(d['cleaned_content'])
+            cleaned_content = truncate_article(cleaned_content)
+
+            domain = ""
+            if 'domain' not in d:
+                domain = "Not Specified"
+            else:
+                domain = d['domain']
+
+            raw_date = d.get('year_month_day', "Unknown Date")
+
+            # Format the date from YYYY-MM-DD to MM/DD/YYYY if available
+            if raw_date != "Unknown Date":
+                try:
+                    publication_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%m/%d/%Y")
+                except ValueError:
+                    publication_date = "Invalid Date"
+            else:
+                publication_date = raw_date
+
+            writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
+                             'domain': domain, 'published_date': publication_date})
+
+    return save_path
 
 
 def save_embedding_base_articles(query, article_embeddings, titles, contents, uuids, urls, num_articles=15):
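Note on the revert: dropping the boost function means results are ordered by plain relevance again. For anyone re-adding it later, Solr's recip(x, m, a, b) computes a / (m * x + b), so recip(ms(NOW,year_month_day), 3.16e-11, 1, 1) decays from 1 toward 0 as an article ages; 3.16e-11 is roughly 1 / (milliseconds per year), so a one-year-old article sits near 0.5. A back-of-envelope Python mirror of the removed product(0.3, recip(...)) term:

MS_PER_YEAR = 365.25 * 24 * 3600 * 1000  # about 3.16e10 ms

def recency_boost(age_years: float, weight: float = 0.3) -> float:
    """Mirror of product(0.3, recip(ms(NOW, date), 3.16e-11, 1, 1))."""
    age_ms = age_years * MS_PER_YEAR
    return weight / (3.16e-11 * age_ms + 1.0)

for years in (0, 1, 5):
    print(years, round(recency_boost(years), 3))  # 0.3, 0.15, 0.05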
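One caveat that applies to both versions: the keywords string is interpolated into the query unescaped, so Lucene metacharacters (quotes, colons, parentheses) can break the query or change its meaning. A hedged sketch of one way to guard against that; escape_solr is a hypothetical helper, not part of this repo:

def escape_solr(text: str) -> str:
    # Backslash-escape the Lucene/Solr query metacharacters.
    specials = set('+-&|!(){}[]^"~*?:\\/')
    return ''.join('\\' + ch if ch in specials else ch for ch in text)

query = f'text:({escape_solr("heated tobacco: what now?")}) AND dead_url:(false)'
# -> text:(heated tobacco\: what now\?) AND dead_url:(false)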
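And a short smoke test of the restored function, assuming SOLR_KEY is set in the environment and the Solr host is reachable; the keyword string is a hypothetical example:

import csv
from get_articles import save_solr_articles

path = save_solr_articles("menthol ban", num_articles=5)
with open(path, newline='') as f:
    for row in csv.DictReader(f):
        print(row['published_date'], row['title'])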