tensorwitz's picture
Added modified PyTrends library
889f06f
raw
history blame
5.31 kB
from datetime import date, timedelta
from functools import partial
from time import sleep
from calendar import monthrange
import pandas as pd
from pytrends.exceptions import ResponseError
from pytrends.request import TrendReq
def get_last_date_of_month(year: int, month: int) -> date:
    """Return the final calendar day of ``month`` in ``year`` as a ``date``.

    ``calendar.monthrange`` yields ``(weekday_of_first_day, days_in_month)``;
    the second element doubles as the day number of the month's last day and
    already accounts for leap years.
    Source: https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python
    """
    _first_weekday, days_in_month = monthrange(year, month)
    return date(year=year, month=month, day=days_in_month)
def convert_dates_to_timeframe(start: date, stop: date) -> str:
    """Build the Google Trends timeframe string for the span ``start``..``stop``.

    Both endpoints are rendered as ``YYYY-MM-DD`` and separated by a single
    space, which is the format passed as ``timeframe`` when building a
    Google Trends payload.
    """
    endpoints = (start, stop)
    return ' '.join(day.strftime('%Y-%m-%d') for day in endpoints)
def _fetch_data(pytrends, build_payload, timeframe: str) -> pd.DataFrame:
    """Fetch interest-over-time data for ``timeframe``, retrying on ResponseError.

    The payload is (re)built and the data is requested inside the same
    ``try`` so that a ``ResponseError`` raised by either step triggers a
    retry. Between attempts the function sleeps 60, 65, 70, ... seconds.

    Args:
        pytrends: Object exposing ``interest_over_time()``; the result of that
            call is returned on success.
        build_payload: Callable accepting a ``timeframe`` keyword argument
            that prepares ``pytrends`` for the request.
        timeframe (str): Time frame string forwarded to ``build_payload``.

    Returns:
        pd.DataFrame: The frame returned by ``pytrends.interest_over_time()``,
        or an empty DataFrame if every attempt failed. Returning an empty
        frame (instead of calling ``interest_over_time()`` after a failed
        payload build) avoids handing back data that belongs to a previously
        built payload.
    """
    max_retries = 3  # one initial attempt plus three retries
    for attempt in range(max_retries + 1):
        try:
            build_payload(timeframe=timeframe)
            return pytrends.interest_over_time()
        except ResponseError as err:
            print(err)
            if attempt == max_retries:
                # No further attempt follows, so waiting would be pointless.
                break
            delay = 60 + 5 * attempt
            print(f'Trying again in {delay} seconds.')
            sleep(delay)
    print(f'Failed after {max_retries + 1} attempts, abort fetching.')
    return pd.DataFrame()
def get_daily_data(word: str,
                   start_year: int,
                   start_mon: int,
                   stop_year: int,
                   stop_mon: int,
                   geo: str = 'US',
                   verbose: bool = True,
                   wait_time: float = 5.0) -> pd.DataFrame:
    """Given a word, fetches daily search volume data from Google Trends and
    returns results in a pandas DataFrame.

    Details: Due to the way Google Trends scales and returns data, special
    care needs to be taken to make the daily data comparable over different
    months. To do that, we download daily data on a month by month basis,
    and also monthly data. The monthly data is downloaded in one go, so that
    the monthly values are comparable amongst themselves and can be used to
    scale the daily data. The daily data is scaled by multiplying the daily
    value by the monthly search volume divided by 100.
    For a more detailed explanation see http://bit.ly/trendsscaling

    Args:
        word (str): Word to fetch daily data for.
        start_year (int): the start year
        start_mon (int): start 1st day of the month
        stop_year (int): the end year
        stop_mon (int): end at the last day of the month
        geo (str): geolocation
        verbose (bool): If True, then prints the word and current time frame
            we are fetching the data for.
        wait_time (float): Seconds to sleep after each monthly request so
            that requests are not sent too quickly.

    Returns:
        complete (pd.DataFrame): Contains the following columns.
            The column named after the word argument contains the daily search
            volume already scaled and comparable through time.
            The column f'{word}_unscaled' is the original daily data fetched
            month by month, and it is not comparable across different months
            (but is comparable within a month).
            The column f'{word}_monthly' contains the original monthly data
            fetched at once. The values in this column have been
            forward-filled so that days following a monthly observation
            reuse that observation.
            The column 'scale' contains the scale used to obtain the scaled
            daily data.
            NOTE(review): only the daily frames have 'isPartial' dropped, so
            an 'isPartial' column coming from the monthly frame is presumably
            present as well -- confirm against callers.

    Raises:
        ValueError: If the start month lies after the stop month.
    """
    # Reject a reversed range up front, before any network request is made.
    if (start_year, start_mon) > (stop_year, stop_mon):
        raise ValueError(
            f'start ({start_year}-{start_mon}) must not be after '
            f'stop ({stop_year}-{stop_mon})')

    # Set up start and stop dates
    start_date = date(start_year, start_mon, 1)
    stop_date = get_last_date_of_month(stop_year, stop_mon)

    # Start pytrends (English interface); the region is chosen via `geo`
    pytrends = TrendReq(hl='en-US', tz=360)
    # Initialize build_payload with the word we need data for
    build_payload = partial(pytrends.build_payload,
                            kw_list=[word], cat=0, geo=geo, gprop='')

    # Obtain data for the whole range in a single request
    monthly = _fetch_data(pytrends, build_payload,
                          convert_dates_to_timeframe(start_date, stop_date))

    # Get daily data, month by month
    frames = []
    current = start_date
    while current < stop_date:
        last_date_of_month = get_last_date_of_month(current.year, current.month)
        timeframe = convert_dates_to_timeframe(current, last_date_of_month)
        if verbose:
            print(f'{word}:{timeframe}')
        frames.append(_fetch_data(pytrends, build_payload, timeframe))
        current = last_date_of_month + timedelta(days=1)
        sleep(wait_time)  # don't go too fast or Google will send 429s

    daily = pd.concat(frames).drop(columns=['isPartial'])
    complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')

    # Scale daily data by monthly weights so the data is comparable.
    # Assign the filled column back instead of using a chained
    # `ffill(inplace=True)`, which does not update `complete` when pandas
    # Copy-on-Write is active.
    monthly_col = f'{word}_monthly'
    complete[monthly_col] = complete[monthly_col].ffill()  # fill NaN values
    complete['scale'] = complete[monthly_col] / 100
    complete[word] = complete[f'{word}_unscaled'] * complete['scale']
    return complete