Ozgur Unlu committed on
Commit
61f73d5
·
1 Parent(s): 4ce18b0
Files changed (2) hide show
  1. news_checker.py +53 -6
  2. requirements.txt +3 -1
news_checker.py CHANGED
@@ -7,6 +7,8 @@ from transformers import pipeline, AutoTokenizer, AutoModel
7
  import torch
8
  import numpy as np
9
  from sklearn.metrics.pairwise import cosine_similarity
 
 
10
 
11
  load_dotenv()
12
 
@@ -25,10 +27,42 @@ class NewsChecker:
25
  # Initialize semantic similarity model
26
  self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
27
  self.model = AutoModel.from_pretrained('distilbert-base-uncased')
 
 
28
  print("Models initialized successfully")
29
  except Exception as e:
30
  print(f"Error initializing clients: {str(e)}")
31
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def get_embedding(self, text):
33
  """Get embedding for a text using DistilBERT"""
34
  try:
@@ -80,6 +114,15 @@ class NewsChecker:
80
  return pd.DataFrame()
81
 
82
  try:
 
 
 
 
 
 
 
 
 
83
  # Get news from the last 7 days
84
  week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
85
 
@@ -88,13 +131,13 @@ class NewsChecker:
88
  if marketing_embedding is None:
89
  return pd.DataFrame()
90
 
91
- # Start with a broader search to get potentially relevant articles
92
  response = self.newsapi.get_everything(
93
- q='news', # Broad search term
94
  from_param=week_ago,
95
  language='en',
96
  sort_by='relevancy',
97
- page_size=50 # Get more articles for better matching
98
  )
99
 
100
  if response['status'] == 'ok':
@@ -103,9 +146,13 @@ class NewsChecker:
103
  for article in response['articles']:
104
  if not article['title'] or not article['description']:
105
  continue
 
 
 
 
 
106
 
107
  # Get embedding for article
108
- article_text = f"{article['title']} {article['description']}"
109
  article_embedding = self.get_embedding(article_text)
110
 
111
  if article_embedding is None:
@@ -115,7 +162,7 @@ class NewsChecker:
115
  similarity = self.calculate_similarity(marketing_embedding, article_embedding)
116
 
117
  # Check if article is both semantically similar and negative
118
- if similarity > 0.5 and self.is_negative_news(article['title'], article['description']):
119
  relevant_news.append({
120
  'title': article['title'],
121
  'description': article['description'],
 
7
  import torch
8
  import numpy as np
9
  from sklearn.metrics.pairwise import cosine_similarity
10
+ import spacy
11
+ import re
12
 
13
  load_dotenv()
14
 
 
27
  # Initialize semantic similarity model
28
  self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
29
  self.model = AutoModel.from_pretrained('distilbert-base-uncased')
30
+ # Load spaCy model for keyword extraction
31
+ self.nlp = spacy.load('en_core_web_sm')
32
  print("Models initialized successfully")
33
  except Exception as e:
34
  print(f"Error initializing clients: {str(e)}")
35
+
36
+ def extract_key_terms(self, text):
37
+ """Extract key product and topic terms from the text"""
38
+ doc = self.nlp(text)
39
+
40
+ # Extract noun phrases and product-related terms
41
+ key_terms = []
42
+
43
+ # Get noun phrases
44
+ for chunk in doc.noun_chunks:
45
+ if len(chunk.text.split()) <= 3: # Limit to phrases of 3 words or less
46
+ key_terms.append(chunk.text.lower())
47
+
48
+ # Get product-related nouns and proper nouns
49
+ for token in doc:
50
+ if token.pos_ in ['NOUN', 'PROPN'] and not any(token.text.lower() in term for term in key_terms):
51
+ key_terms.append(token.text.lower())
52
+
53
+ # Clean terms
54
+ cleaned_terms = []
55
+ for term in key_terms:
56
+ # Remove common marketing words
57
+ if term not in ['introduction', 'collection', 'products', 'items', 'things']:
58
+ # Clean the term
59
+ cleaned = re.sub(r'[^\w\s-]', '', term)
60
+ cleaned = cleaned.strip()
61
+ if cleaned and len(cleaned) > 2: # Only keep terms longer than 2 characters
62
+ cleaned_terms.append(cleaned)
63
+
64
+ return list(set(cleaned_terms)) # Remove duplicates
65
+
66
  def get_embedding(self, text):
67
  """Get embedding for a text using DistilBERT"""
68
  try:
 
114
  return pd.DataFrame()
115
 
116
  try:
117
+ # Extract key terms from marketing text
118
+ key_terms = self.extract_key_terms(marketing_text)
119
+ if not key_terms:
120
+ return pd.DataFrame()
121
+
122
+ # Create search query from key terms
123
+ search_query = ' OR '.join([f'"{term}"' for term in key_terms[:5]]) # Use top 5 terms
124
+ print(f"Searching news with query: {search_query}")
125
+
126
  # Get news from the last 7 days
127
  week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
128
 
 
131
  if marketing_embedding is None:
132
  return pd.DataFrame()
133
 
134
+ # Search news with the extracted terms
135
  response = self.newsapi.get_everything(
136
+ q=search_query,
137
  from_param=week_ago,
138
  language='en',
139
  sort_by='relevancy',
140
+ page_size=50
141
  )
142
 
143
  if response['status'] == 'ok':
 
146
  for article in response['articles']:
147
  if not article['title'] or not article['description']:
148
  continue
149
+
150
+ # Check if the article contains any of our key terms
151
+ article_text = f"{article['title'].lower()} {article['description'].lower()}"
152
+ if not any(term in article_text for term in key_terms):
153
+ continue
154
 
155
  # Get embedding for article
 
156
  article_embedding = self.get_embedding(article_text)
157
 
158
  if article_embedding is None:
 
162
  similarity = self.calculate_similarity(marketing_embedding, article_embedding)
163
 
164
  # Check if article is both semantically similar and negative
165
+ if similarity > 0.6 and self.is_negative_news(article['title'], article['description']):
166
  relevant_news.append({
167
  'title': article['title'],
168
  'description': article['description'],
requirements.txt CHANGED
@@ -9,4 +9,6 @@ pandas==2.1.4
9
  numpy==1.24.3
10
  requests==2.31.0
11
  python-dotenv==1.0.0
12
- pyspellchecker==0.7.2
 
 
 
9
  numpy==1.24.3
10
  requests==2.31.0
11
  python-dotenv==1.0.0
12
+ pyspellchecker==0.7.2
13
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl
14
+ spacy==3.7.2