Upload 2 files
updated url mistake and solr back to old
- app.py +1 -1
- get_articles.py +68 -89
app.py
CHANGED
@@ -94,7 +94,7 @@ Be concise but informative. If a specific detail isn't in the content, say so ra
 
 url_prompt = """Generate a Tobacco Watcher article URL based on the query. Follow these rules:
 
-1. Base URL: https://tobaccowatcher.
+1. Base URL: https://tobaccowatcher.globaltobaccocontrol.org/articles/
 2. Parameters:
    - Subject (c=): Can have multiple
    - Product (pro=): Can have multiple
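For illustration only (not part of the commit), a URL following these prompt rules can be composed with the corrected base plus repeatable c= (subject) and pro= (product) parameters; the facet values below are hypothetical:

# Illustrative sketch: build an articles URL per the prompt rules above.
# The facet values are made up for the example.
from urllib.parse import urlencode

base = "https://tobaccowatcher.globaltobaccocontrol.org/articles/"
params = [("c", "policy"), ("c", "health"), ("pro", "cigarette")]
print(f"{base}?{urlencode(params)}")
# -> https://tobaccowatcher.globaltobaccocontrol.org/articles/?c=policy&c=health&pro=cigarette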
get_articles.py
CHANGED
@@ -71,97 +71,76 @@ Minor details:
 """
 def save_solr_articles(keywords: str, num_articles=15) -> str:
     """Save top articles from Solr search to CSV."""
-    ...
-        for d in outputs.docs:
-            if article_count == num_articles:
-                break
-
-            # Skip if required fields are missing
-            if 'title' not in d or 'uuid' not in d or 'cleaned_content' not in d or 'url' not in d:
-                continue
-            ...
-            writer.writerow({
-                'title': title_cleaned,
-                'uuid': d['uuid'],
-                'content': cleaned_content,
-                'url': d['url'],
-                'domain': domain,
-                'published_date': publication_date
-            })
-            print(f"Article saved: {title_cleaned}, {d['uuid']}, {domain}, {publication_date}")
-
-        return save_path
-
-    except Exception as e:
-        print(f"Error in save_solr_articles: {str(e)}")
-        raise
+    solr_key = os.getenv("SOLR_KEY")
+    SOLR_ARTICLES_URL = f"https://website:{solr_key}@solr.machines.globalhealthwatcher.org:8080/solr/articles/"
+    solr = Solr(SOLR_ARTICLES_URL, verify=False)
+
+    # No duplicates
+    fq = ['-dups:0']
+
+    query = f'text:({keywords})' + " AND " + "dead_url:(false)"
+
+    # Get top 2*num_articles articles and then remove misformed or duplicate articles
+    outputs = solr.search(query, fq=fq, sort="score desc", rows=num_articles * 2)
+
+    article_count = 0
+
+    save_path = os.path.join("data", "articles.csv")
+    if not os.path.exists(os.path.dirname(save_path)):
+        os.makedirs(os.path.dirname(save_path))
+
+    with open(save_path, 'w', newline='') as csvfile:
+        fieldnames = ['title', 'uuid', 'content', 'url', 'domain', 'published_date']
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
+        writer.writeheader()
+
+        title_five_words = set()
+
+        for d in outputs.docs:
+            if article_count == num_articles:
+                break
+
+            # skip if title returns a keyerror
+            if 'title' not in d or 'uuid' not in d or 'cleaned_content' not in d or 'url' not in d:
+                continue
+
+            title_cleaned = remove_spaces_newlines(d['title'])
+
+            split = title_cleaned.split()
+            # skip if title is a duplicate
+            if not len(split) < 5:
+                five_words = title_cleaned.split()[:5]
+                five_words = ' '.join(five_words)
+                if five_words in title_five_words:
+                    continue
+                title_five_words.add(five_words)
+
+            article_count += 1
+
+            cleaned_content = remove_spaces_newlines(d['cleaned_content'])
+            cleaned_content = truncate_article(cleaned_content)
+
+            domain = ""
+            if 'domain' not in d:
+                domain = "Not Specified"
+            else:
+                domain = d['domain']
+
+            raw_date = d.get('year_month_day', "Unknown Date")
+
+            # Format the date from YYYY-MM-DD to MM/DD/YYYY if available
+            if raw_date != "Unknown Date":
+                try:
+                    publication_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%m/%d/%Y")
+                except ValueError:
+                    publication_date = "Invalid Date"
+            else:
+                publication_date = raw_date
+
+            writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
+                             'domain': domain, 'published_date': publication_date})
+
+    return save_path
 
 
 def save_embedding_base_articles(query, article_embeddings, titles, contents, uuids, urls, num_articles=15):
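Usage sketch (not part of the commit): assuming this repo's get_articles.py with its existing imports (os, csv, datetime, pysolr's Solr) and helpers (remove_spaces_newlines, truncate_article), and a valid SOLR_KEY credential, the rolled-back function can be exercised like this; the keyword string and key value are hypothetical:

# Hypothetical caller for save_solr_articles as defined above.
import os
os.environ.setdefault("SOLR_KEY", "placeholder-key")  # placeholder credential

from get_articles import save_solr_articles

# Issues the Solr query text:(vaping AND regulation) AND dead_url:(false)
# with fq=-dups:0, then writes up to 10 rows to data/articles.csv.
csv_path = save_solr_articles("vaping AND regulation", num_articles=10)
print(f"Articles written to {csv_path}")

Design note: the function over-fetches num_articles * 2 rows, then skips docs with missing required fields and titles whose first five words repeat, so the loop can still reach num_articles rows after those skips.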