Spaces:

tensora
/

ai-journalist-demo

Runtime error

App Files Files Community

ai-journalist-demo / pytrends /dailydata.py

tensorwitz

Added modified PyTrends library

889f06f 7 months ago

raw

history blame

5.31 kB

	from datetime import date, timedelta
	from functools import partial
	from time import sleep
	from calendar import monthrange

	import pandas as pd

	from pytrends.exceptions import ResponseError
	from pytrends.request import TrendReq


	def get_last_date_of_month(year: int, month: int) -> date:
	    """Return the final calendar day of ``month`` in ``year`` as a ``date``.

	    The number of days is taken from ``calendar.monthrange``, so leap years
	    are accounted for (February 2020 yields the 29th).

	    Source: https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python
	    """
	    # monthrange returns (weekday of the 1st, number of days in the month);
	    # only the day count is needed here.
	    _first_weekday, days_in_month = monthrange(year, month)
	    return date(year=year, month=month, day=days_in_month)


	def convert_dates_to_timeframe(start: date, stop: date) -> str:
	    """Format a date interval as the timeframe string sent to Google Trends.

	    Both bounds are rendered as ``YYYY-MM-DD`` and separated by a single
	    space, e.g. ``'2020-01-01 2020-01-31'``.
	    """
	    day_format = '%Y-%m-%d'
	    return ' '.join(bound.strftime(day_format) for bound in (start, stop))


	def _fetch_data(pytrends, build_payload, timeframe: str,
	                max_retries: int = 3) -> pd.DataFrame:
	    """Build the payload for ``timeframe`` and return the interest over time.

	    ``build_payload`` is retried when it raises a ``ResponseError``, waiting
	    60, 65, 70, ... seconds between consecutive attempts.

	    Args:
	        pytrends: object whose ``interest_over_time()`` method returns the
	            data for the most recently built payload.
	        build_payload: callable accepting a ``timeframe`` keyword argument.
	        timeframe (str): time frame string passed on to ``build_payload``.
	        max_retries (int): number of retries after the first failed attempt.

	    Returns:
	        Whatever ``pytrends.interest_over_time()`` returns once the payload
	        has been built successfully.

	    Raises:
	        ResponseError: if the initial attempt and all ``max_retries``
	            retries fail. The error is re-raised instead of falling through
	            to ``interest_over_time()``, which would otherwise hand back the
	            result of an earlier payload (or fail for lack of one).
	    """
	    attempts = 0
	    while True:
	        try:
	            build_payload(timeframe=timeframe)
	        except ResponseError as err:
	            print(err)
	            attempts += 1
	            if attempts > max_retries:
	                # Out of retries: do not sleep again and do not return
	                # data that belongs to a different payload.
	                print(f'Failed after {attempts} attempts, abort fetching.')
	                raise
	            delay = 60 + 5 * (attempts - 1)
	            print(f'Trying again in {delay} seconds.')
	            sleep(delay)
	        else:
	            break
	    return pytrends.interest_over_time()


	def get_daily_data(word: str,
	                   start_year: int,
	                   start_mon: int,
	                   stop_year: int,
	                   stop_mon: int,
	                   geo: str = 'US',
	                   verbose: bool = True,
	                   wait_time: float = 5.0) -> pd.DataFrame:
	    """Given a word, fetches daily search volume data from Google Trends and
	    returns results in a pandas DataFrame.

	    Details: Due to the way Google Trends scales and returns data, special
	    care needs to be taken to make the daily data comparable over different
	    months. To do that, we download daily data on a month by month basis,
	    and also monthly data. The monthly data is downloaded in one go, so that
	    the monthly values are comparable amongst themselves and can be used to
	    scale the daily data. The daily data is scaled by multiplying the daily
	    value by the monthly search volume divided by 100.
	    For a more detailed explanation see http://bit.ly/trendsscaling

	    Args:
	        word (str): Word to fetch daily data for.
	        start_year (int): the start year
	        start_mon (int): start month; fetching begins on its 1st day
	        stop_year (int): the end year
	        stop_mon (int): end month; fetching ends on its last day
	        geo (str): geolocation passed on to the payload
	        verbose (bool): If True, then prints the word and current time frame
	            we are fetching the data for.
	        wait_time (float): seconds to sleep after each month-by-month
	            request, to avoid being rate limited.

	    Returns:
	        complete (pd.DataFrame): The daily data joined with the data of the
	        single whole-range request.
	        The column named after the word argument contains the daily search
	        volume already scaled and comparable through time.
	        The column f'{word}_unscaled' is the original daily data fetched
	        month by month, and it is not comparable across different months
	        (but is comparable within a month).
	        The column f'{word}_monthly' contains the original monthly data
	        fetched at once. The values in this column have been forward-filled
	        so that each day carries the most recent monthly value.
	        The column 'scale' contains the scale used to obtain the scaled
	        daily data.
	        Any other column of the whole-range frame is carried over by the
	        join unchanged (only the daily frames have 'isPartial' dropped).
	    """
	    # Set up start and stop dates
	    start_date = date(start_year, start_mon, 1)
	    stop_date = get_last_date_of_month(stop_year, stop_mon)

	    # Start pytrends; the region is selected through ``geo`` in the payload
	    pytrends = TrendReq(hl='en-US', tz=360)
	    # Initialize build_payload with the word we need data for
	    build_payload = partial(pytrends.build_payload,
	                            kw_list=[word], cat=0, geo=geo, gprop='')

	    # Fetch the whole [start_date, stop_date] range in a single request so
	    # that its values share one scale
	    monthly = _fetch_data(pytrends, build_payload,
	                          convert_dates_to_timeframe(start_date, stop_date))

	    # Get daily data, month by month
	    daily_frames = []
	    current = start_date
	    while current < stop_date:
	        last_date_of_month = get_last_date_of_month(current.year, current.month)
	        timeframe = convert_dates_to_timeframe(current, last_date_of_month)
	        if verbose:
	            print(f'{word}:{timeframe}')
	        daily_frames.append(_fetch_data(pytrends, build_payload, timeframe))
	        current = last_date_of_month + timedelta(days=1)
	        sleep(wait_time)  # don't go too fast or Google will send 429s

	    daily = pd.concat(daily_frames).drop(columns=['isPartial'])
	    complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')

	    # Scale daily data by monthly weights so the data is comparable.
	    # Assign the filled column back explicitly: an in-place ffill on a
	    # column selection does not update the frame under pandas
	    # Copy-on-Write.
	    monthly_col = f'{word}_monthly'
	    complete[monthly_col] = complete[monthly_col].ffill()  # fill NaN values
	    complete['scale'] = complete[monthly_col] / 100
	    complete[word] = complete[f'{word}_unscaled'] * complete['scale']

	    return complete