Ozgur Unlu committed
Commit · 61f73d5
Parent(s): 4ce18b0
news fix
Files changed:
- news_checker.py: +53 -6
- requirements.txt: +3 -1
news_checker.py CHANGED
@@ -7,6 +7,8 @@ from transformers import pipeline, AutoTokenizer, AutoModel
 import torch
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+import spacy
+import re
 
 load_dotenv()
 
@@ -25,10 +27,42 @@ class NewsChecker:
             # Initialize semantic similarity model
             self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
             self.model = AutoModel.from_pretrained('distilbert-base-uncased')
+            # Load spaCy model for keyword extraction
+            self.nlp = spacy.load('en_core_web_sm')
             print("Models initialized successfully")
         except Exception as e:
             print(f"Error initializing clients: {str(e)}")
-
+
+    def extract_key_terms(self, text):
+        """Extract key product and topic terms from the text"""
+        doc = self.nlp(text)
+
+        # Extract noun phrases and product-related terms
+        key_terms = []
+
+        # Get noun phrases
+        for chunk in doc.noun_chunks:
+            if len(chunk.text.split()) <= 3:  # Limit to phrases of 3 words or less
+                key_terms.append(chunk.text.lower())
+
+        # Get product-related nouns and proper nouns
+        for token in doc:
+            if token.pos_ in ['NOUN', 'PROPN'] and not any(token.text.lower() in term for term in key_terms):
+                key_terms.append(token.text.lower())
+
+        # Clean terms
+        cleaned_terms = []
+        for term in key_terms:
+            # Remove common marketing words
+            if term not in ['introduction', 'collection', 'products', 'items', 'things']:
+                # Clean the term
+                cleaned = re.sub(r'[^\w\s-]', '', term)
+                cleaned = cleaned.strip()
+                if cleaned and len(cleaned) > 2:  # Only keep terms longer than 2 characters
+                    cleaned_terms.append(cleaned)
+
+        return list(set(cleaned_terms))  # Remove duplicates
+
     def get_embedding(self, text):
         """Get embedding for a text using DistilBERT"""
         try:
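For context on what the new extract_key_terms helper produces, here is an illustrative call (the output is a guess; actual terms depend on the en_core_web_sm tagger and parser, and the no-argument constructor is assumed from the diff):

    checker = NewsChecker()
    terms = checker.extract_key_terms(
        "Introducing our new eco-friendly running shoes with improved cushioning"
    )
    # Plausible result (set() makes the order arbitrary):
    # ['improved cushioning', 'shoes']

Note that noun chunks longer than three words (e.g. "our new eco-friendly running shoes") are dropped entirely rather than shortened.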
@@ -80,6 +114,15 @@
             return pd.DataFrame()
 
         try:
+            # Extract key terms from marketing text
+            key_terms = self.extract_key_terms(marketing_text)
+            if not key_terms:
+                return pd.DataFrame()
+
+            # Create search query from key terms
+            search_query = ' OR '.join([f'"{term}"' for term in key_terms[:5]])  # Use top 5 terms
+            print(f"Searching news with query: {search_query}")
+
             # Get news from the last 7 days
             week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
 
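The query built here is a quoted, OR-joined string. With illustrative terms:

    key_terms = ['improved cushioning', 'shoes', 'eco-friendly materials']
    search_query = ' OR '.join([f'"{term}"' for term in key_terms[:5]])
    # '"improved cushioning" OR "shoes" OR "eco-friendly materials"'

Since extract_key_terms returns list(set(...)), the list has no defined order, so the "top 5 terms" slice is effectively an arbitrary five terms rather than the five most important ones.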
@@ -88,13 +131,13 @@
             if marketing_embedding is None:
                 return pd.DataFrame()
 
-            #
+            # Search news with the extracted terms
             response = self.newsapi.get_everything(
-                q=
+                q=search_query,
                 from_param=week_ago,
                 language='en',
                 sort_by='relevancy',
-                page_size=50
+                page_size=50
             )
 
             if response['status'] == 'ok':
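self.newsapi is not shown in this diff; presumably it is a NewsApiClient from the newsapi-python package created in __init__. A minimal sketch, assuming an API key in a NEWS_API_KEY environment variable (the variable name and the initialization are assumptions, not part of this commit):

    import os
    from newsapi import NewsApiClient

    # Assumed initialization; the commit does not show this code
    newsapi = NewsApiClient(api_key=os.getenv('NEWS_API_KEY'))
    response = newsapi.get_everything(
        q='"improved cushioning" OR "shoes"',  # illustrative query
        from_param='2024-01-01',               # illustrative date string
        language='en',
        sort_by='relevancy',
        page_size=50
    )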
@@ -103,9 +146,13 @@
                 for article in response['articles']:
                     if not article['title'] or not article['description']:
                         continue
+
+                    # Check if the article contains any of our key terms
+                    article_text = f"{article['title'].lower()} {article['description'].lower()}"
+                    if not any(term in article_text for term in key_terms):
+                        continue
 
                     # Get embedding for article
-                    article_text = f"{article['title']} {article['description']}"
                     article_embedding = self.get_embedding(article_text)
 
                     if article_embedding is None:
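The new pre-filter is a plain substring check, so short terms can match inside longer words. For example:

    key_terms = ['ring']
    article_text = 'new engineering graduates hired'
    any(term in article_text for term in key_terms)  # True: 'ring' is inside 'engineering'

If that becomes a problem, a word-boundary match such as re.search(rf'\b{re.escape(term)}\b', article_text) would be stricter; re is already imported.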
@@ -115,7 +162,7 @@
                     similarity = self.calculate_similarity(marketing_embedding, article_embedding)
 
                     # Check if article is both semantically similar and negative
-                    if similarity > 0.
+                    if similarity > 0.6 and self.is_negative_news(article['title'], article['description']):
                         relevant_news.append({
                             'title': article['title'],
                             'description': article['description'],
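The threshold check relies on get_embedding and calculate_similarity, whose bodies are unchanged and therefore not shown in this diff. For reference, a typical implementation consistent with the file's imports (torch, numpy, sklearn's cosine_similarity) would mean-pool DistilBERT's last hidden state; this is a sketch of that pattern, not the file's actual code:

    def get_embedding(self, text):
        """Get embedding for a text using DistilBERT"""
        # Sketch only: mean-pool the last hidden state into one vector
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()

    def calculate_similarity(self, emb1, emb2):
        # Sketch only: cosine_similarity takes 2D arrays and returns a 2D array
        return cosine_similarity(emb1, emb2)[0][0]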
requirements.txt CHANGED
@@ -9,4 +9,6 @@ pandas==2.1.4
 numpy==1.24.3
 requests==2.31.0
 python-dotenv==1.0.0
-pyspellchecker==0.7.2
+pyspellchecker==0.7.2
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl
+spacy==3.7.2
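Pinning the en_core_web_sm wheel by URL means a plain pip install -r requirements.txt pulls the model together with spacy==3.7.2, with no separate python -m spacy download en_core_web_sm step; the 3.7.0 model release is compatible with spaCy 3.7.x.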