Ozgur Unlu committed
Commit · 61f73d5
Parent(s): 4ce18b0
news fix
Files changed:
- news_checker.py: +53 -6
- requirements.txt: +3 -1
news_checker.py CHANGED
@@ -7,6 +7,8 @@ from transformers import pipeline, AutoTokenizer, AutoModel
 import torch
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+import spacy
+import re
 
 load_dotenv()
 
@@ -25,10 +27,42 @@ class NewsChecker:
             # Initialize semantic similarity model
             self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
             self.model = AutoModel.from_pretrained('distilbert-base-uncased')
+            # Load spaCy model for keyword extraction
+            self.nlp = spacy.load('en_core_web_sm')
             print("Models initialized successfully")
         except Exception as e:
             print(f"Error initializing clients: {str(e)}")
-
+
+    def extract_key_terms(self, text):
+        """Extract key product and topic terms from the text"""
+        doc = self.nlp(text)
+
+        # Extract noun phrases and product-related terms
+        key_terms = []
+
+        # Get noun phrases
+        for chunk in doc.noun_chunks:
+            if len(chunk.text.split()) <= 3:  # Limit to phrases of 3 words or less
+                key_terms.append(chunk.text.lower())
+
+        # Get product-related nouns and proper nouns
+        for token in doc:
+            if token.pos_ in ['NOUN', 'PROPN'] and not any(token.text.lower() in term for term in key_terms):
+                key_terms.append(token.text.lower())
+
+        # Clean terms
+        cleaned_terms = []
+        for term in key_terms:
+            # Remove common marketing words
+            if term not in ['introduction', 'collection', 'products', 'items', 'things']:
+                # Clean the term
+                cleaned = re.sub(r'[^\w\s-]', '', term)
+                cleaned = cleaned.strip()
+                if cleaned and len(cleaned) > 2:  # Only keep terms longer than 2 characters
+                    cleaned_terms.append(cleaned)
+
+        return list(set(cleaned_terms))  # Remove duplicates
+
     def get_embedding(self, text):
         """Get embedding for a text using DistilBERT"""
         try:
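For context on what the new extract_key_terms helper produces, here is an illustrative call (the output is a guess; actual terms depend on the en_core_web_sm tagger and parser, and the no-argument constructor is assumed from the diff):

    checker = NewsChecker()
    terms = checker.extract_key_terms(
        "Introducing our new eco-friendly running shoes with improved cushioning"
    )
    # Plausible result (set() makes the order arbitrary):
    # ['improved cushioning', 'shoes']

Note that noun chunks longer than three words (e.g. "our new eco-friendly running shoes") are dropped entirely rather than shortened.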
@@ -80,6 +114,15 @@
             return pd.DataFrame()
 
         try:
+            # Extract key terms from marketing text
+            key_terms = self.extract_key_terms(marketing_text)
+            if not key_terms:
+                return pd.DataFrame()
+
+            # Create search query from key terms
+            search_query = ' OR '.join([f'"{term}"' for term in key_terms[:5]])  # Use top 5 terms
+            print(f"Searching news with query: {search_query}")
+
             # Get news from the last 7 days
             week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
 
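The query built here is a quoted, OR-joined string. With illustrative terms:

    key_terms = ['improved cushioning', 'shoes', 'eco-friendly materials']
    search_query = ' OR '.join([f'"{term}"' for term in key_terms[:5]])
    # '"improved cushioning" OR "shoes" OR "eco-friendly materials"'

Since extract_key_terms returns list(set(...)), the list has no defined order, so the "top 5 terms" slice is effectively an arbitrary five terms rather than the five most important ones.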
@@ -88,13 +131,13 @@
             if marketing_embedding is None:
                 return pd.DataFrame()
 
-            #
+            # Search news with the extracted terms
             response = self.newsapi.get_everything(
-                q=
+                q=search_query,
                 from_param=week_ago,
                 language='en',
                 sort_by='relevancy',
-                page_size=50
+                page_size=50
             )
 
             if response['status'] == 'ok':
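self.newsapi is not shown in this diff; presumably it is a NewsApiClient from the newsapi-python package created in __init__. A minimal sketch, assuming an API key in a NEWS_API_KEY environment variable (the variable name and the initialization are assumptions, not part of this commit):

    import os
    from newsapi import NewsApiClient

    # Assumed initialization; the commit does not show this code
    newsapi = NewsApiClient(api_key=os.getenv('NEWS_API_KEY'))
    response = newsapi.get_everything(
        q='"improved cushioning" OR "shoes"',  # illustrative query
        from_param='2024-01-01',               # illustrative date string
        language='en',
        sort_by='relevancy',
        page_size=50
    )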
@@ -103,9 +146,13 @@
                 for article in response['articles']:
                     if not article['title'] or not article['description']:
                         continue
+
+                    # Check if the article contains any of our key terms
+                    article_text = f"{article['title'].lower()} {article['description'].lower()}"
+                    if not any(term in article_text for term in key_terms):
+                        continue
 
                     # Get embedding for article
-                    article_text = f"{article['title']} {article['description']}"
                     article_embedding = self.get_embedding(article_text)
 
                     if article_embedding is None:
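The new pre-filter is a plain substring check, so short terms can match inside longer words. For example:

    key_terms = ['ring']
    article_text = 'new engineering graduates hired'
    any(term in article_text for term in key_terms)  # True: 'ring' is inside 'engineering'

If that becomes a problem, a word-boundary match such as re.search(rf'\b{re.escape(term)}\b', article_text) would be stricter; re is already imported.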
@@ -115,7 +162,7 @@
                     similarity = self.calculate_similarity(marketing_embedding, article_embedding)
 
                     # Check if article is both semantically similar and negative
-                    if similarity > 0.
+                    if similarity > 0.6 and self.is_negative_news(article['title'], article['description']):
                         relevant_news.append({
                             'title': article['title'],
                             'description': article['description'],
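The threshold check relies on get_embedding and calculate_similarity, whose bodies are unchanged and therefore not shown in this diff. For reference, a typical implementation consistent with the file's imports (torch, numpy, sklearn's cosine_similarity) would mean-pool DistilBERT's last hidden state; this is a sketch of that pattern, not the file's actual code:

    def get_embedding(self, text):
        """Get embedding for a text using DistilBERT"""
        # Sketch only: mean-pool the last hidden state into one vector
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()

    def calculate_similarity(self, emb1, emb2):
        # Sketch only: cosine_similarity takes 2D arrays and returns a 2D array
        return cosine_similarity(emb1, emb2)[0][0]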
requirements.txt CHANGED
@@ -9,4 +9,6 @@ pandas==2.1.4
 numpy==1.24.3
 requests==2.31.0
 python-dotenv==1.0.0
-pyspellchecker==0.7.2
+pyspellchecker==0.7.2
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl
+spacy==3.7.2
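Pinning the en_core_web_sm wheel by URL means a plain pip install -r requirements.txt pulls the model together with spacy==3.7.2, with no separate python -m spacy download en_core_web_sm step; the 3.7.0 model release is compatible with spaCy 3.7.x.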