from calendar import monthrange
from datetime import date, timedelta
from functools import partial
from time import sleep

import pandas as pd
from pytrends.exceptions import ResponseError
from pytrends.request import TrendReq

# Total number of build_payload attempts _fetch_data makes before giving up.
_MAX_ATTEMPTS = 4
# Seconds to wait before the first retry, and how much longer each
# subsequent wait gets.
_RETRY_BASE_DELAY = 60
_RETRY_DELAY_STEP = 5


def get_last_date_of_month(year: int, month: int) -> date:
    """Return the last calendar day of the given month as a ``date``.

    Source:
    https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python

    Args:
        year: Calendar year.
        month: Month number, 1-12.

    Returns:
        The date of the last day of ``month`` in ``year``.
    """
    # monthrange returns (weekday of the 1st, number of days in the month).
    return date(year, month, monthrange(year, month)[1])


def convert_dates_to_timeframe(start: date, stop: date) -> str:
    """Format two dates as the ``'YYYY-MM-DD YYYY-MM-DD'`` timeframe string.

    The resulting string is what this module passes as the ``timeframe``
    argument when requesting data from Google Trends.

    Args:
        start: First day of the interval.
        stop: Last day of the interval.

    Returns:
        Both dates formatted as ``%Y-%m-%d`` and separated by one space.
    """
    return f'{start:%Y-%m-%d} {stop:%Y-%m-%d}'


def _fetch_data(pytrends: TrendReq, build_payload,
                timeframe: str) -> pd.DataFrame:
    """Build the payload for ``timeframe`` and return interest over time.

    ``build_payload`` is retried when it raises ``ResponseError``, waiting
    60, 65, 70, ... seconds between attempts.

    Args:
        pytrends: Object whose ``interest_over_time()`` is called once the
            payload has been built successfully.
        build_payload: Callable accepting a ``timeframe`` keyword argument.
        timeframe: Timeframe string, see ``convert_dates_to_timeframe``.

    Returns:
        The result of ``pytrends.interest_over_time()``.

    Raises:
        ResponseError: If every attempt failed. The last error is re-raised
            instead of calling ``interest_over_time()`` without a payload
            for this timeframe, which would hand the caller data that does
            not belong to the requested interval.
    """
    attempts = 0
    while True:
        try:
            build_payload(timeframe=timeframe)
        except ResponseError as err:
            attempts += 1
            print(err)
            if attempts >= _MAX_ATTEMPTS:
                # No point in sleeping before giving up.
                print(f'Failed after {attempts} attempts, abort fetching.')
                raise
            delay = _RETRY_BASE_DELAY + _RETRY_DELAY_STEP * (attempts - 1)
            print(f'Trying again in {delay} seconds.')
            sleep(delay)
        else:
            return pytrends.interest_over_time()


def get_daily_data(word: str,
                   start_year: int,
                   start_mon: int,
                   stop_year: int,
                   stop_mon: int,
                   geo: str = 'US',
                   verbose: bool = True,
                   wait_time: float = 5.0) -> pd.DataFrame:
    """Fetch daily Google Trends search volume for a word, scaled over time.

    Details: Due to the way Google Trends scales and returns data, special
    care needs to be taken to make the daily data comparable over different
    months. To do that, we download daily data on a month by month basis,
    and also monthly data. The monthly data is downloaded in one go, so that
    the monthly values are comparable amongst themselves and can be used to
    scale the daily data. The daily data is scaled by multiplying the daily
    value by the monthly search volume divided by 100.
    For a more detailed explanation see http://bit.ly/trendsscaling

    Args:
        word (str): Word to fetch daily data for.
        start_year (int): The start year.
        start_mon (int): The start month; fetching begins on its 1st day.
        stop_year (int): The end year.
        stop_mon (int): The end month; fetching ends on its last day.
        geo (str): Geolocation passed on to Google Trends.
        verbose (bool): If True, prints the word and the time frame
            currently being fetched.
        wait_time (float): Seconds to sleep after each monthly request so
            that Google does not answer with 429 (too many requests).

    Returns:
        complete (pd.DataFrame): The column named after the word argument
            contains the daily search volume already scaled and comparable
            through time. The column f'{word}_unscaled' is the original
            daily data fetched month by month, and it is not comparable
            across different months (but is comparable within a month).
            The column f'{word}_monthly' contains the original monthly data
            fetched at once, forward-filled so that days following a
            monthly observation carry that observation's value. The column
            'scale' contains the scale used to obtain the scaled daily
            data. The 'isPartial' column of the monthly frame is not
            dropped and is therefore also part of the result.

    Raises:
        ValueError: If the start month lies after the stop month.
        ResponseError: If Google Trends keeps rejecting a request after all
            retries (see ``_fetch_data``).
    """
    # Set up start and stop dates
    start_date = date(start_year, start_mon, 1)
    stop_date = get_last_date_of_month(stop_year, stop_mon)
    if start_date > stop_date:
        # Fail early instead of after a round of doomed network requests.
        raise ValueError(
            f'start ({start_date}) must not be after stop ({stop_date})')

    # Start pytrends with an en-US host language; the region is set by `geo`
    pytrends = TrendReq(hl='en-US', tz=360)
    # Initialize build_payload with the word we need data for
    build_payload = partial(pytrends.build_payload,
                            kw_list=[word], cat=0, geo=geo, gprop='')

    # Obtain monthly data for the whole [start_date, stop_date] interval
    monthly = _fetch_data(pytrends, build_payload,
                          convert_dates_to_timeframe(start_date, stop_date))

    # Get daily data, month by month
    monthly_frames = []
    current = start_date
    while current < stop_date:
        last_date_of_month = get_last_date_of_month(current.year,
                                                    current.month)
        timeframe = convert_dates_to_timeframe(current, last_date_of_month)
        if verbose:
            print(f'{word}:{timeframe}')
        monthly_frames.append(_fetch_data(pytrends, build_payload, timeframe))
        current = last_date_of_month + timedelta(days=1)
        sleep(wait_time)  # don't go too fast or Google will send 429s

    daily = pd.concat(monthly_frames).drop(columns=['isPartial'])
    complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')

    # Scale daily data by monthly weights so the data is comparable.
    monthly_col = f'{word}_monthly'
    # Assign the filled column back: a chained `.ffill(inplace=True)` works
    # on a temporary under pandas Copy-on-Write and leaves the NaNs in place.
    complete[monthly_col] = complete[monthly_col].ffill()
    complete['scale'] = complete[monthly_col] / 100
    complete[word] = complete[f'{word}_unscaled'] * complete['scale']

    return complete