{ "cells": [ { "cell_type": "raw", "metadata": { "id": "uIxcPJeuGGAF" }, "source": [ "---\n", "title: Newsletter Helper\n", "description: Follow the instructions on screen\n", "show-code: false\n", "params:\n", " feed_keywords:\n", " label: Sources\n", " input: select\n", " value: ['a16z.com/',\n", " 'sequoiacap.com/article',\n", " 'zettavp.com/playbook/',\n", " 'atomico.com/insights/',\n", " 'nt-z.ro/',\n", " 'accel.com/noteworthy',\n", " 'felicis.com/',\n", " 'scalevp.com/blog/',\n", " 'redpoint.com/start/',\n", " '83north.com/',\n", " 'bvp.com/atlas/']\n", " choices: ['a16z.com/',\n", " 'sequoiacap.com/article',\n", " 'zettavp.com/playbook/',\n", " 'atomico.com/insights/',\n", " 'nt-z.ro/',\n", " 'accel.com/noteworthy',\n", " 'felicis.com/',\n", " 'scalevp.com/blog/',\n", " 'redpoint.com/start/',\n", " '83north.com/',\n", " 'bvp.com/atlas/']\n", " multi: True\n", " feed_age:\n", " label: How old?\n", " input: select\n", " value: '7 days'\n", " choices: ['7 days', '14 days', '30 days']\n", " multi: False\n", "---" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "pfJ5NpqjCT1U" }, "outputs": [], "source": [ "feed_keywords = ['a16z.com/',\n", " 'sequoiacap.com/article',\n", " 'zettavp.com/playbook/',\n", " 'atomico.com/insights/',\n", " 'nt-z.ro/',\n", " 'accel.com/noteworthy',\n", " 'felicis.com/',\n", " 'scalevp.com/blog/',\n", " 'redpoint.com/start/',\n", " '83north.com/',\n", " 'bvp.com/atlas/']\n", "feed_age = '28 days'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "mEOS4asyGGAI" }, "outputs": [], "source": [ "keywords = [\"Electro mobility\",\n", " \"Batteries \",\n", " \"Battery Management systems\",\n", " \"Lidars\",\n", " \"RADARS\",\n", " \"AI\",\n", " \"Industrial AI\",\n", " \"Transportation\",\n", " \"Mobility\",\n", " \"Climate Tech\",\n", " \"Sustainable grid\",\n", " \"Sensor fusion\",\n", " \"Computer vision\",\n", " \"Data Analytics\",\n", " \"Digital Twins\",\n", " \"Automotive Cybersecurity\",\n", " \"Logistics\",\n", " \"Ports\",\n", " \"Construction sites\",\n", " \"Mines\",\n", " \"Quarries\",\n", " \"Trucks\",\n", " \"Power train\",\n", " \"Software defined vehicle\"]\n", "\n", "feed = \"https://www.rssground.com/p/Newsletter\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "WMswc6FCGR9T" }, "outputs": [], "source": [ "#!pip install keybert\n", "#!pip install feedparser\n", "#!pip install keyphrase_vectorizers\n", "#!pip install sentence-transformers" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "Ig5nSCbI6yuL" }, "outputs": [], "source": [ "from keybert import KeyBERT\n", "import pandas as pd\n", "from keyphrase_vectorizers import KeyphraseCountVectorizer\n", "from sentence_transformers import SentenceTransformer\n", "import numpy as np\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "import feedparser\n", "import requests\n", "from bs4 import BeautifulSoup\n", "from openpyxl import Workbook\n", "import time\n", "import pickle\n", "import os\n", "from tqdm import tqdm\n", "from concurrent.futures import ThreadPoolExecutor\n", "#from functools import lru_cache\n", "\n", "# Define function to extract keywords from the HTML body using the YAKE keyword extractor\n", "def extract_keyphrases(text, kw_model, vectorizer, embedding_model):\n", " kph = [kw for kw, score in kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', vectorizer=vectorizer, use_mmr=True)]\n", " keyphrase_embeddings = embedding_model.encode(kph)\n", " return 
kph, keyphrase_embeddings\n", "\n", "def get_similarity_scores(keyword_embeddings, keyphrase_embeddings):\n", " similarity_scores = cosine_similarity(keyphrase_embeddings, keyword_embeddings).max(axis=1).astype(str).tolist()\n", " similarity_max = cosine_similarity(keyphrase_embeddings, keyword_embeddings).flatten().max().astype(str)\n", " return similarity_scores, similarity_max\n", "\n", "# Define function to get the redirected URL (if any) for a given URL\n", "def get_redirected_url(url_record, headers, expected_codes=(301, 302, 303, 307), timeout=60):\n", " try:\n", " res = requests.head(url_record['url'], headers=headers, timeout=timeout)\n", " if res.status_code in expected_codes:\n", " url_record['url'] = res.headers['location']\n", " elif res.status_code == 200:\n", " url_record['url'] = url_record['url']\n", " else:\n", " print(f\"Retrieving {url_record['url']} failed: Expected {expected_codes}, but received {res.status_code}: {res.reason}\")\n", " except requests.exceptions.Timeout:\n", " print(f\"\\nRequest timed out for {url_record['url']}\")\n", " return url_record\n", " except:\n", " return url_record\n", "\n", " return url_record\n", "\n", "# Define function to get the HTML body of a given URL\n", "def get_html_body(url, headers):\n", " try:\n", " response = requests.get(url, headers=headers, timeout=10)\n", " html = response.content\n", " soup = BeautifulSoup(html, 'html.parser')\n", " return soup.body.get_text()\n", " except:\n", " return ''\n", "\n", "# Define function to write data to the Excel sheet\n", "def write_data_to_excel(url_dict, filename):\n", " # Create a new Excel workbook and worksheet\n", " workbook = Workbook()\n", " worksheet = workbook.active\n", " worksheet.title = 'RSS Feeds'\n", "\n", " # Write the headers for the Excel sheet\n", " worksheet.cell(row=1, column=1, value='Feed Name')\n", " worksheet.cell(row=1, column=2, value='URL')\n", " worksheet.cell(row=1, column=3, value='Updated')\n", " worksheet.cell(row=1, column=4, value='Keyphrases')\n", " worksheet.cell(row=1, column=5, value='Similarity to supplied keywords')\n", " worksheet.cell(row=1, column=6, value='Similarity (max)')\n", " worksheet.cell(row=1, column=7, value='HTML Body')\n", "\n", " # Loop over the unique URLs and write them to the Excel sheet\n", " row_num = 2\n", " for url, data in url_dict.items():\n", " worksheet.cell(row=row_num, column=1, value=data['feed_name'])\n", " worksheet.cell(row=row_num, column=2, value=url)\n", " worksheet.cell(row=row_num, column=3, value=data['updated'])\n", " worksheet.cell(row=row_num, column=4, value=data['keyphrases'])\n", " worksheet.cell(row=row_num, column=5, value=data['similarity'])\n", " worksheet.cell(row=row_num, column=6, value=data['similarity_max'])\n", " worksheet.cell(row=row_num, column=7, value=data['html_body'])\n", "\n", " row_num += 1\n", "\n", " worksheet.freeze_panes = 'A2'\n", "\n", " # Set the number format for column A, except the first row\n", " for row in worksheet.iter_rows(min_row=2, min_col=3, max_col=3):\n", " for cell in row:\n", " cell.number_format = 'mm/dd/yyyy hh:mm:ss'\n", "\n", " # Save the Excel workbook\n", " workbook.save(filename)\n", "\n", " # Print confirmation message\n", " #print(f'RSS output written to excel sheet: {filename}')\n", "\n", "def remaining_entries_from_dict(filename, dictionary):\n", " pickle_data = {}\n", " if os.path.exists(filename):\n", " with open(filename, 'rb') as f:\n", " pickle_data = pickle.load(f)\n", " return list(set(dictionary.keys()) - set(pickle_data.keys()))\n", "\n", 
"def process_url(url):\n", " global url_dict\n", " \n", " #body = get_html_body(url, headers)\n", " #kph,keyphrase_embeddings = extract_keyphrases(body, kw_model, vectorizer, embedding_model)\n", " #similarity, similarity_max = get_similarity_scores(keyword_embeddings, keyphrase_embeddings)\n", "\n", " #url_dict[url]['keyphrases'] = ', '.join(kph)\n", " #url_dict[url]['similarity'] = ', '.join(similarity)\n", " #url_dict[url]['similarity_max'] = similarity_max\n", " #url_dict[url]['html_body'] = body\n", " \n", " url_dict[url]['keyphrases'] = ''\n", " url_dict[url]['similarity'] = ''\n", " url_dict[url]['similarity_max'] = ''\n", " url_dict[url]['html_body'] = \"Skipping this part, to speed up the process\"\n", "\n", " # Store temporary results to disk\n", " #with open(\"retrieved_urls.pkl\", 'wb') as f:\n", " # pickle.dump(url_dict, f)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "5cHnJQDSDy1Q" }, "outputs": [], "source": [ "import pprint\n", "from concurrent.futures import ThreadPoolExecutor, as_completed\n", "from tqdm import tqdm\n", "from datetime import datetime\n", "import nltk\n", "\n", "\n", "# Initialize the SentenceTransformer model\n", "kw_model = KeyBERT('distilbert-base-nli-mean-tokens')\n", "vectorizer = KeyphraseCountVectorizer()\n", "embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')\n", "nltk.download('stopwords', quiet=True)\n", "\n", "# Initialize variables\n", "headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'\n", "}\n", "keyword_embeddings = embedding_model.encode(keywords) # Encode keywords using the embedding model\n", "\n", "def filter_strings(lst1, lst2):\n", " \"\"\"\n", " Filters the list `lst2` and returns only the elements that have any of the elements of `lst1` as a substring.\n", " \n", " Args:\n", " lst1 (list): The list of substrings to match against.\n", " lst2 (list): The list of strings to filter.\n", "\n", " Returns:\n", " list: A new list containing the filtered elements from `lst2`.\n", "\n", " Examples:\n", " >>> lst1 = ['apple', 'banana', 'orange']\n", " >>> lst2 = ['apple pie', 'banana bread', 'cherry pie', 'orange juice']\n", " >>> filter_strings(lst1, lst2)\n", " ['apple pie', 'banana bread', 'orange juice']\n", " \"\"\"\n", " filtered_lst2 = [s for s in lst2 if any(substring in s for substring in lst1)]\n", " return filtered_lst2\n", "\n", "\n", "def read_feeds(rss_feed, how_old):\n", " global urls\n", " import sys\n", " import io\n", " import re\n", " from datetime import datetime, timedelta\n", " import pytz\n", "\n", " old_stdout = sys.stdout\n", " sys.stdout = mystdout = io.StringIO()\n", "\n", " # Loop over the RSS feeds and keywords\n", " urls_temp = []\n", " urls = []\n", "\n", " # Get the desired timezone\n", " timezone = pytz.timezone('Europe/Stockholm') # Replace 'Your_Timezone_Here' with the desired timezone\n", "\n", " # Calculate the age with timezone\n", " feed_item_age_minimum = datetime.now(timezone) - timedelta(days=int(how_old.split()[0]))\n", "\n", " feed = feedparser.parse(rss_feed)\n", " for entry in tqdm(feed.entries, total=len(feed.entries), file=sys.stdout, bar_format='\\tReading feed entries: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n", " soup = BeautifulSoup(entry.summary, 'html.parser')\n", " updated = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %z')\n", " if re.search(r'@([^ ]+)', entry.title):\n", " feed_name = re.search(r'@([^ ]+)', 
entry.title).group(1)\n", " else:\n", " feed_name = ''\n", " if updated > feed_item_age_minimum:\n", " urls_temp.extend([{'url': link.get('href'), 'updated': updated, 'feed_name': feed_name} for link in soup.find_all('a')])\n", "\n", " with ThreadPoolExecutor(max_workers=4) as executor:\n", " futures = [executor.submit(get_redirected_url, url, headers) for url in urls_temp]\n", " for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Checking URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n", " urls.append(future.result())\n", "\n", " sys.stdout = old_stdout\n", " return mystdout.getvalue()\n", "\n", "def read_process_urls():\n", " import sys\n", " import io\n", " from datetime import datetime, timedelta\n", " old_stdout = sys.stdout\n", " sys.stdout = mystdout = io.StringIO()\n", "\n", " global urls\n", " global url_dict\n", "\n", " #print(f\"Urls: {urls}\")\n", " url_dict = {}\n", " for item in filter_strings(feed_keywords, urls):\n", " feed_name = item['feed_name']\n", " updated = item['updated']\n", " url = item['url']\n", "\n", " import pprint\n", " pprint.pprint(url)\n", " if url not in url_dict.keys():\n", " url_dict[url] = {'updated': updated, 'feed_name': feed_name}\n", " else:\n", " if url_dict[url]['updated'] > updated:\n", " url_dict[url]['updated'] = updated\n", "\n", " start_parallel_loop_time = time.time()\n", " results = []\n", " with ThreadPoolExecutor(max_workers=4) as executor:\n", " futures = [executor.submit(process_url, url) for url in url_dict.keys()]#remaining_entries_from_dict(\"retrieved_urls.pkl\", url_dict)]\n", " for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Reading URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n", " results.append(future.result())\n", " #print(f\"Parallel URL processing: {time.time() - start_parallel_loop_time:.3f} seconds\")\n", " print(f\"Total links processed: {len(url_dict.keys())}\")\n", "\n", " #with open(\"retrieved_urls.pkl\", 'wb') as f:\n", " # pickle.dump(url_dict, f)\n", "\n", " # Write dataset to the Excel sheet\n", " write_data_to_excel(url_dict, 'newsletter_results.xlsx')\n", "\n", " sys.stdout = old_stdout\n", " return mystdout.getvalue()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "FNR1jfm-jsgb" }, "outputs": [], "source": [ "from ipywidgets import HTML\n", "\n", "read_feeds(feed, feed_age)\n", "display(HTML(f\"Total links examined: {len(urls)}\"))\n", "\n", "read_process_urls()\n", "display(HTML(f\"Relevant links found: {len(url_dict.keys())}\"))\n", "display(HTML(f\"------------------------------\"))\n", "\n", "for url in url_dict.keys():\n", " #print(url)\n", " display(HTML(f\"{url}\"))\n" ] } ], "metadata": { "accelerator": "GPU", "colab": { "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 0 }