tensorwitz committed on
Commit
889f06f
·
1 Parent(s): b279245

Added modified PyTrends library

Browse files
pytrends/__init__.py ADDED
File without changes
pytrends/dailydata.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import date, timedelta
2
+ from functools import partial
3
+ from time import sleep
4
+ from calendar import monthrange
5
+
6
+ import pandas as pd
7
+
8
+ from pytrends.exceptions import ResponseError
9
+ from pytrends.request import TrendReq
10
+
11
+
12
def get_last_date_of_month(year: int, month: int) -> date:
    """Return the final calendar day of the given month as a ``date``.

    ``calendar.monthrange`` yields ``(weekday_of_first_day, days_in_month)``;
    the second item is also the day-of-month of the last date.

    Source: https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python
    """
    _first_weekday, days_in_month = monthrange(year, month)
    return date(year, month, days_in_month)
19
+
20
+
21
def convert_dates_to_timeframe(start: date, stop: date) -> str:
    """Format two dates as the ``'YYYY-MM-DD YYYY-MM-DD'`` interval string.

    The result is the timeframe value handed to Google Trends to request
    data for a specific span (start first, stop second, one space between).
    """
    day_format = '%Y-%m-%d'
    return ' '.join(day.strftime(day_format) for day in (start, stop))
27
+
28
+
29
+ def _fetch_data(pytrends, build_payload, timeframe: str) -> pd.DataFrame:
30
+ """Attempts to fecth data and retries in case of a ResponseError."""
31
+ attempts, fetched = 0, False
32
+ while not fetched:
33
+ try:
34
+ build_payload(timeframe=timeframe)
35
+ except ResponseError as err:
36
+ print(err)
37
+ print(f'Trying again in {60 + 5 * attempts} seconds.')
38
+ sleep(60 + 5 * attempts)
39
+ attempts += 1
40
+ if attempts > 3:
41
+ print('Failed after 3 attemps, abort fetching.')
42
+ break
43
+ else:
44
+ fetched = True
45
+ return pytrends.interest_over_time()
46
+
47
+
48
def get_daily_data(word: str,
                   start_year: int,
                   start_mon: int,
                   stop_year: int,
                   stop_mon: int,
                   geo: str = 'US',
                   verbose: bool = True,
                   wait_time: float = 5.0) -> pd.DataFrame:
    """Given a word, fetches daily search volume data from Google Trends and
    returns results in a pandas DataFrame.

    Details: Due to the way Google Trends scales and returns data, special
    care needs to be taken to make the daily data comparable over different
    months. To do that, we download daily data on a month by month basis,
    and also monthly data. The monthly data is downloaded in one go, so that
    the monthly values are comparable amongst themselves and can be used to
    scale the daily data. The daily data is scaled by multiplying the daily
    value by the monthly search volume divided by 100.
    For a more detailed explanation see http://bit.ly/trendsscaling

    Args:
        word (str): Word to fetch daily data for.
        start_year (int): the start year
        start_mon (int): start at the 1st day of this month
        stop_year (int): the end year
        stop_mon (int): end at the last day of this month
        geo (str): geolocation passed to ``build_payload``
        verbose (bool): If True, then prints the word and current time frame
            we are fetching the data for.
        wait_time (float): seconds to sleep after each monthly request.

    Returns:
        complete (pd.DataFrame):
            The column named after the word argument contains the daily search
            volume already scaled and comparable through time.
            The column f'{word}_unscaled' is the original daily data fetched
            month by month, and it is not comparable across different months
            (but is comparable within a month).
            The column f'{word}_monthly' contains the original monthly data
            fetched at once. The values in this column have been
            forward-filled so that days following a data point inherit it.
            The column 'scale' contains the scale used to obtain the scaled
            daily data.
            Any other column of the monthly frame that is not dropped here
            (``isPartial`` is only removed from the daily data) is carried
            over by the join.
    """

    # Set up start and stop dates
    start_date = date(start_year, start_mon, 1)
    stop_date = get_last_date_of_month(stop_year, stop_mon)

    # Start a pytrends session; the region itself is selected via ``geo``
    pytrends = TrendReq(hl='en-US', tz=360)
    # Initialize build_payload with the word we need data for
    build_payload = partial(pytrends.build_payload,
                            kw_list=[word], cat=0, geo=geo, gprop='')

    # Obtain the coarse data for the whole [start_date, stop_date] span at once
    monthly = _fetch_data(pytrends, build_payload,
                          convert_dates_to_timeframe(start_date, stop_date))

    # Get daily data, month by month
    results = {}
    current = start_date
    while current < stop_date:
        last_date_of_month = get_last_date_of_month(current.year, current.month)
        timeframe = convert_dates_to_timeframe(current, last_date_of_month)
        if verbose:
            print(f'{word}:{timeframe}')
        results[current] = _fetch_data(pytrends, build_payload, timeframe)
        current = last_date_of_month + timedelta(days=1)
        sleep(wait_time)  # don't go too fast or Google will send 429s

    daily = pd.concat(results.values()).drop(columns=['isPartial'])
    complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')

    # Scale daily data by monthly weights so the data is comparable.
    # Assign the filled column back explicitly: an in-place ffill on a column
    # selection is a chained operation that may act on a temporary copy and
    # leave the NaN values in ``complete`` untouched.
    monthly_col = f'{word}_monthly'
    complete[monthly_col] = complete[monthly_col].ffill()
    complete['scale'] = complete[monthly_col] / 100
    complete[word] = complete[f'{word}_unscaled'] * complete['scale']

    return complete
pytrends/exceptions.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class ResponseError(Exception):
    """Raised when the response received from Google is not usable."""

    def __init__(self, message, response):
        # keep the raw response on the error so it can be handled upstream
        self.response = response
        super().__init__(message)

    @classmethod
    def from_response(cls, response):
        """Create the error from an object exposing ``status_code`` and ``text``."""
        code, body = response.status_code, response.text
        return cls(
            'The request failed: Google returned a response '
            f'with code {code} and message {body}',
            response,
        )
13
+
14
+
15
class TooManyRequestsError(ResponseError):
    """Raised when the backend answers with a 429 error code."""
pytrends/request.py ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import pandas as pd
4
+ import requests
5
+
6
+ from requests.adapters import HTTPAdapter
7
+ from requests.packages.urllib3.util.retry import Retry
8
+ from requests import status_codes
9
+
10
+ from pytrends import exceptions
11
+
12
+ from urllib.parse import quote
13
+
14
+
15
BASE_TRENDS_URL = 'https://trends.google.com/trends'


class TrendReq(object):
    """
    Google Trends API

    An instance keeps the keyword payload created by ``build_payload`` and
    the widget descriptions (request + token) returned for it; the query
    methods reuse those widgets when asking for the actual data.
    """
    GET_METHOD = 'get'
    POST_METHOD = 'post'
    GENERAL_URL = f'{BASE_TRENDS_URL}/api/explore'
    INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multiline'
    MULTIRANGE_INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multirange'
    INTEREST_BY_REGION_URL = f'{BASE_TRENDS_URL}/api/widgetdata/comparedgeo'
    RELATED_QUERIES_URL = f'{BASE_TRENDS_URL}/api/widgetdata/relatedsearches'
    TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/hottrends/visualize/internal/data'
    TOP_CHARTS_URL = f'{BASE_TRENDS_URL}/api/topcharts'
    SUGGESTIONS_URL = f'{BASE_TRENDS_URL}/api/autocomplete/'
    CATEGORIES_URL = f'{BASE_TRENDS_URL}/api/explore/pickers/category'
    TODAY_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/dailytrends'
    REALTIME_TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/realtimetrends'
    TRENDS_URL = f'{BASE_TRENDS_URL}/api/trends'
    # status codes that trigger an automatic retry when retries are enabled
    ERROR_CODES = (500, 502, 504, 429)
    # content types accepted as JSON payloads by _get_data
    _JSON_CONTENT_TYPES = ('application/json', 'application/javascript',
                           'text/javascript')

    def __init__(self, hl='en-US', tz=360, geo='', timeout=(2, 5), proxies='',
                 retries=0, backoff_factor=0, requests_args=None):
        """
        Initialize default values for params

        :param hl: language tag sent as ``accept-language`` and as ``hl`` parameter
        :param tz: value sent as the ``tz`` request parameter
        :param geo: default geo used when ``build_payload`` receives none
        :param timeout: timeout forwarded to every ``requests`` call
        :param proxies: sequence of https proxy URLs (empty for no proxy)
        :param retries: retry count for the ``Retry`` adapter (0 disables it
            unless ``backoff_factor`` is positive)
        :param backoff_factor: backoff factor for the ``Retry`` adapter
        :param requests_args: extra keyword arguments forwarded to every
            ``requests`` call; a ``headers`` entry is merged into the session
            headers instead
        """
        # google rate limit
        self.google_rl = 'You have reached your quota limit. Please try again later.'
        self.results = None
        # set user defined options used globally
        self.tz = tz
        self.hl = hl
        self.geo = geo
        self.kw_list = list()
        self.timeout = timeout
        self.proxies = proxies  # add a proxy option
        self.retries = retries
        self.backoff_factor = backoff_factor
        self.proxy_index = 0
        # work on a copy so popping 'headers' below never alters the caller's dict
        self.requests_args = dict(requests_args or {})
        self.cookies = self.GetGoogleCookie()
        # initialize widget payloads
        self.token_payload = dict()
        self.interest_over_time_widget = dict()
        self.interest_by_region_widget = dict()
        self.related_topics_widget_list = list()
        self.related_queries_widget_list = list()

        self.headers = {'accept-language': self.hl}
        self.headers.update(self.requests_args.pop('headers', {}))

    def _fetch_nid_cookie(self, url, **extra):
        """GET ``url`` and return a dict holding only the 'NID' cookie (if any)."""
        response = requests.get(url, timeout=self.timeout, **extra,
                                **self.requests_args)
        return {name: value for name, value in response.cookies.items()
                if name == 'NID'}

    def GetGoogleCookie(self):
        """
        Gets google cookie (used for each and every proxy; once on init otherwise)
        Removes proxy from the list on proxy error

        When ``requests_args`` carries its own ``proxies`` entry the request is
        retried until it succeeds; only request-level failures are retried so
        that interrupts and programming errors are no longer swallowed.
        Otherwise a failing proxy from ``self.proxies`` is dropped and the next
        one is tried; the ``ProxyError`` is re-raised once no alternative is left.
        """
        cookie_url = f'{BASE_TRENDS_URL}/explore/?geo={self.hl[-2:]}'
        while True:
            if "proxies" in self.requests_args:
                try:
                    return self._fetch_nid_cookie(cookie_url)
                except requests.exceptions.RequestException:
                    continue
            else:
                if len(self.proxies) > 0:
                    proxy = {'https': self.proxies[self.proxy_index]}
                else:
                    proxy = ''
                try:
                    return self._fetch_nid_cookie(cookie_url, proxies=proxy)
                except requests.exceptions.ProxyError:
                    print('Proxy error. Changing IP')
                    if len(self.proxies) > 1:
                        del self.proxies[self.proxy_index]
                        # the list shrank: keep the index inside its bounds
                        if self.proxy_index >= len(self.proxies):
                            self.proxy_index = 0
                    else:
                        print('No more proxies available. Bye!')
                        raise
                    continue

    def GetNewProxy(self):
        """
        Increment proxy INDEX; zero on overflow
        """
        if self.proxy_index < (len(self.proxies) - 1):
            self.proxy_index += 1
        else:
            self.proxy_index = 0

    def _build_retry(self):
        """Create the ``Retry`` policy for GET and POST requests."""
        options = dict(total=self.retries, read=self.retries,
                       connect=self.retries,
                       backoff_factor=self.backoff_factor,
                       status_forcelist=TrendReq.ERROR_CODES)
        methods = frozenset(['GET', 'POST'])
        try:
            return Retry(allowed_methods=methods, **options)
        except TypeError:
            # older urllib3 releases only know the former keyword name
            return Retry(method_whitelist=methods, **options)

    @staticmethod
    def _is_json_response(response):
        """Tell whether ``response`` is a 200 answer with a JSON-like content type."""
        if response.status_code != 200:
            return False
        content_type = response.headers.get('Content-Type', '')
        return any(kind in content_type
                   for kind in TrendReq._JSON_CONTENT_TYPES)

    def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
        """Send a request to Google and return the JSON response as a Python object
        :param url: the url to which the request will be sent
        :param method: the HTTP method ('get' or 'post')
        :param trim_chars: how many characters should be trimmed off the beginning of the content of the response
            before this is passed to the JSON parser
        :param kwargs: any extra key arguments passed to the request builder (usually query parameters or data)
        :return: the parsed JSON content
        :raises exceptions.TooManyRequestsError: when the response status is 429
        :raises exceptions.ResponseError: for any other unusable response
        """
        # the context manager closes the session's connections when done
        with requests.session() as s:
            # Retries mechanism. Activated when one of statements >0 (best used for proxy)
            if self.retries > 0 or self.backoff_factor > 0:
                s.mount('https://', HTTPAdapter(max_retries=self._build_retry()))

            s.headers.update(self.headers)
            if len(self.proxies) > 0:
                self.cookies = self.GetGoogleCookie()
                s.proxies.update({'https': self.proxies[self.proxy_index]})
            # DO NOT USE retries or backoff_factor in the request call itself
            send = s.post if method == TrendReq.POST_METHOD else s.get
            response = send(url, timeout=self.timeout, cookies=self.cookies,
                            **kwargs, **self.requests_args)
            # check if the response contains json and throw an exception otherwise
            # Google mostly sends 'application/json' in the Content-Type header,
            # but occasionally it sends 'application/javascript'
            # and sometimes even 'text/javascript'.
            # The status check applies to every content type, so an error
            # page served as javascript is not parsed as a success.
            if self._is_json_response(response):
                # trim initial characters
                # some responses start with garbage characters, like ")]}',"
                # these have to be cleaned before being passed to the json parser
                content = response.text[trim_chars:]
                # rotate to the next proxy after a successful request
                self.GetNewProxy()
                return json.loads(content)
            if response.status_code == status_codes.codes.too_many_requests:
                raise exceptions.TooManyRequestsError.from_response(response)
            raise exceptions.ResponseError.from_response(response)

    def build_payload(self, kw_list, cat=0, timeframe='today 5-y', geo='',
                      gprop=''):
        """Create the payload for related queries, interest over time and interest by region

        :param kw_list: keywords to query
        :param cat: category id placed in the request
        :param timeframe: one timeframe string for all keywords, or a list
            with one timeframe per keyword (matched by position)
        :param geo: geo for this payload; falls back to the instance default
        :param gprop: '', 'images', 'news', 'youtube' or 'froogle'
        :raises ValueError: when ``gprop`` is not one of the accepted values
        """
        if gprop not in ['', 'images', 'news', 'youtube', 'froogle']:
            raise ValueError('gprop must be empty (to indicate web), images, news, youtube, or froogle')
        self.kw_list = kw_list
        self.geo = geo or self.geo
        self.token_payload = {
            'hl': self.hl,
            'tz': self.tz,
            'req': {'comparisonItem': [], 'category': cat, 'property': gprop}
        }

        comparison_items = self.token_payload['req']['comparisonItem']
        # Check if timeframe is a list
        if isinstance(timeframe, list):
            for index, kw in enumerate(self.kw_list):
                comparison_items.append(
                    {'keyword': kw, 'time': timeframe[index], 'geo': self.geo})
        else:
            # build out json for each keyword with the shared timeframe
            for kw in self.kw_list:
                comparison_items.append(
                    {'keyword': kw, 'time': timeframe, 'geo': self.geo})

        # requests will mangle this if it is not a string
        self.token_payload['req'] = json.dumps(self.token_payload['req'])
        # get tokens
        self._tokens()
        return

    def _tokens(self):
        """Makes request to Google to get API tokens for interest over time, interest by region and related queries"""
        # make the request and parse the returned json
        widget_dicts = self._get_data(
            url=TrendReq.GENERAL_URL,
            method=TrendReq.POST_METHOD,
            params=self.token_payload,
            trim_chars=4,
        )['widgets']
        # order of the json matters...
        first_region_token = True
        # clear self.related_queries_widget_list and self.related_topics_widget_list
        # of old keywords' widgets
        self.related_queries_widget_list[:] = []
        self.related_topics_widget_list[:] = []
        # assign requests
        for widget in widget_dicts:
            if widget['id'] == 'TIMESERIES':
                self.interest_over_time_widget = widget
            if widget['id'] == 'GEO_MAP' and first_region_token:
                self.interest_by_region_widget = widget
                first_region_token = False
            # response for each term, put into a list
            if 'RELATED_TOPICS' in widget['id']:
                self.related_topics_widget_list.append(widget)
            if 'RELATED_QUERIES' in widget['id']:
                self.related_queries_widget_list.append(widget)
        return

    def interest_over_time(self):
        """Request data from Google's Interest Over Time section and return a dataframe

        The frame is indexed by date, has one integer column per keyword of
        ``kw_list`` and a boolean ``isPartial`` column. An empty frame is
        returned when Google sends no timeline data.
        """

        over_time_payload = {
            # convert to string as requests will mangle
            'req': json.dumps(self.interest_over_time_widget['request']),
            'token': self.interest_over_time_widget['token'],
            'tz': self.tz
        }

        # make the request and parse the returned json
        req_json = self._get_data(
            url=TrendReq.INTEREST_OVER_TIME_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=over_time_payload,
        )

        df = pd.DataFrame(req_json['default']['timelineData'])
        if (df.empty):
            return df

        df['date'] = pd.to_datetime(df['time'].astype(dtype='float64'),
                                    unit='s')
        df = df.set_index(['date']).sort_index()
        # split list columns into separate ones, remove brackets and split on comma
        result_df = df['value'].apply(lambda x: pd.Series(
            str(x).replace('[', '').replace(']', '').split(',')))
        # rename each column with its search term, relying on order that google provides...
        for idx, kw in enumerate(self.kw_list):
            # there is currently a bug with assigning columns that may be
            # parsed as a date in pandas: use explicit insert column method
            result_df.insert(len(result_df.columns), kw,
                             result_df[idx].astype('int'))
            del result_df[idx]

        if 'isPartial' in df:
            # make other dataframe from isPartial key data
            # split list columns into separate ones, remove brackets and split on comma
            df = df.fillna(False)
            result_df2 = df['isPartial'].apply(lambda x: pd.Series(
                str(x).replace('[', '').replace(']', '').split(',')))
            result_df2.columns = ['isPartial']
            # Change to a bool type.
            result_df2.isPartial = result_df2.isPartial == 'True'
            # concatenate the two dataframes
            final = pd.concat([result_df, result_df2], axis=1)
        else:
            final = result_df
            final['isPartial'] = False

        return final

    def multirange_interest_over_time(self):
        """Request data from Google's Interest Over Time section across different time ranges and return a dataframe

        Each keyword contributes a "[i] <keyword> date" and a
        "[i] <keyword> value" column; the first row holds the averages.
        """

        over_time_payload = {
            # convert to string as requests will mangle
            'req': json.dumps(self.interest_over_time_widget['request']),
            'token': self.interest_over_time_widget['token'],
            'tz': self.tz
        }

        # make the request and parse the returned json
        req_json = self._get_data(
            url=TrendReq.MULTIRANGE_INTEREST_OVER_TIME_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=over_time_payload,
        )

        df = pd.DataFrame(req_json['default']['timelineData'])
        if (df.empty):
            return df

        result_df = pd.json_normalize(df['columnData'])

        # Split dictionary columns into separate ones
        for i, _ in enumerate(result_df.columns):
            prefix = "[" + str(i) + "] " + str(self.kw_list[i])
            expanded = result_df[i].apply(pd.Series)
            result_df[prefix + " date"] = expanded["formattedTime"]
            result_df[prefix + " value"] = expanded["value"]
            result_df = result_df.drop([i], axis=1)

        # Adds a row with the averages at the top of the dataframe
        avg_row = {}
        for i, avg in enumerate(req_json['default']['averages']):
            prefix = "[" + str(i) + "] " + str(self.kw_list[i])
            avg_row[prefix + " date"] = "Average"
            avg_row[prefix + " value"] = avg

        # insert at index -1, then shift all labels so the averages sort first
        result_df.loc[-1] = avg_row
        result_df.index = result_df.index + 1
        result_df = result_df.sort_index()

        return result_df

    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
                           inc_geo_code=False):
        """Request data from Google's Interest by Region section and return a dataframe

        :param resolution: applied when no geo is set, or when geo is 'US'
            and the value is 'DMA', 'CITY' or 'REGION'
        :param inc_low_vol: sent as ``includeLowSearchVolumeGeos``
        :param inc_geo_code: also include the geo code / coordinates column
        """

        # make the request
        region_payload = dict()
        widget_request = self.interest_by_region_widget['request']
        if self.geo == '':
            widget_request['resolution'] = resolution
        elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
            widget_request['resolution'] = resolution

        widget_request['includeLowSearchVolumeGeos'] = inc_low_vol

        # convert to string as requests will mangle
        region_payload['req'] = json.dumps(widget_request)
        region_payload['token'] = self.interest_by_region_widget['token']
        region_payload['tz'] = self.tz

        # parse returned json
        req_json = self._get_data(
            url=TrendReq.INTEREST_BY_REGION_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=region_payload,
        )
        df = pd.DataFrame(req_json['default']['geoMapData'])
        if (df.empty):
            return df

        # rename the column with the search keyword
        geo_column = 'geoCode' if 'geoCode' in df.columns else 'coordinates'
        columns = ['geoName', geo_column, 'value']
        df = df[columns].set_index(['geoName']).sort_index()
        # split list columns into separate ones, remove brackets and split on comma
        result_df = df['value'].apply(lambda x: pd.Series(
            str(x).replace('[', '').replace(']', '').split(',')))
        if inc_geo_code:
            if geo_column in df.columns:
                result_df[geo_column] = df[geo_column]
            else:
                print('Could not find geo_code column; Skipping')

        # rename each column with its search term
        for idx, kw in enumerate(self.kw_list):
            result_df[kw] = result_df[idx].astype('int')
            del result_df[idx]

        return result_df

    def related_topics(self):
        """Request data from Google's Related Topics section and return a dictionary of dataframes

        If no top and/or rising related topics are found, the value for the key "top" and/or "rising" will be None
        """

        # make the request
        related_payload = dict()
        result_dict = dict()
        for request_json in self.related_topics_widget_list:
            # ensure we know which keyword we are looking at rather than relying on order
            try:
                kw = request_json['request']['restriction'][
                    'complexKeywordsRestriction']['keyword'][0]['value']
            except (KeyError, IndexError):
                kw = ''
            # convert to string as requests will mangle
            related_payload['req'] = json.dumps(request_json['request'])
            related_payload['token'] = request_json['token']
            related_payload['tz'] = self.tz

            # parse the returned json
            req_json = self._get_data(
                url=TrendReq.RELATED_QUERIES_URL,
                method=TrendReq.GET_METHOD,
                trim_chars=5,
                params=related_payload,
            )

            # top topics
            try:
                top_list = req_json['default']['rankedList'][0]['rankedKeyword']
                df_top = pd.json_normalize(top_list, sep='_')
            except (KeyError, IndexError):
                # a missing key or a too-short rankedList means no top topics
                df_top = None

            # rising topics
            try:
                rising_list = req_json['default']['rankedList'][1]['rankedKeyword']
                df_rising = pd.json_normalize(rising_list, sep='_')
            except (KeyError, IndexError):
                # a missing key or a too-short rankedList means no rising topics
                df_rising = None

            result_dict[kw] = {'rising': df_rising, 'top': df_top}
        return result_dict

    def related_queries(self):
        """Request data from Google's Related Queries section and return a dictionary of dataframes

        If no top and/or rising related queries are found, the value for the key "top" and/or "rising" will be None
        """

        # make the request
        related_payload = dict()
        result_dict = dict()
        for request_json in self.related_queries_widget_list:
            # ensure we know which keyword we are looking at rather than relying on order
            try:
                kw = request_json['request']['restriction'][
                    'complexKeywordsRestriction']['keyword'][0]['value']
            except (KeyError, IndexError):
                kw = ''
            # convert to string as requests will mangle
            related_payload['req'] = json.dumps(request_json['request'])
            related_payload['token'] = request_json['token']
            related_payload['tz'] = self.tz

            # parse the returned json
            req_json = self._get_data(
                url=TrendReq.RELATED_QUERIES_URL,
                method=TrendReq.GET_METHOD,
                trim_chars=5,
                params=related_payload,
            )

            # top queries
            try:
                top_df = pd.DataFrame(
                    req_json['default']['rankedList'][0]['rankedKeyword'])
                top_df = top_df[['query', 'value']]
            except (KeyError, IndexError):
                # a missing key/column or a too-short rankedList means no top queries
                top_df = None

            # rising queries
            try:
                rising_df = pd.DataFrame(
                    req_json['default']['rankedList'][1]['rankedKeyword'])
                rising_df = rising_df[['query', 'value']]
            except (KeyError, IndexError):
                # a missing key/column or a too-short rankedList means no rising queries
                rising_df = None

            result_dict[kw] = {'top': top_df, 'rising': rising_df}
        return result_dict

    def trending_searches(self, pn='united_states'):
        """Request data from Google's Hot Searches section and return a dataframe

        :param pn: key of the entry to pick from the returned JSON
        """

        # make the request
        # forms become obsolete due to the new TRENDING_SEARCHES_URL
        # forms = {'ajax': 1, 'pn': pn, 'htd': '', 'htv': 'l'}
        req_json = self._get_data(
            url=TrendReq.TRENDING_SEARCHES_URL,
            method=TrendReq.GET_METHOD
        )[pn]
        result_df = pd.DataFrame(req_json)
        return result_df

    def today_searches(self, pn='US'):
        """Request data from Google Daily Trends section and return the
        'trendingSearches' entry of the first returned day

        :param pn: value sent as the ``geo`` request parameter
        """
        forms = {'ns': 15, 'geo': pn, 'tz': '-180', 'hl': self.hl}
        # requests_args are added by _get_data itself; passing them here as
        # well would supply the same keyword arguments twice
        req_json = self._get_data(
            url=TrendReq.TODAY_SEARCHES_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=forms,
        )['default']['trendingSearchesDays'][0]['trendingSearches']

        return req_json

    def realtime_trending_searches(self, pn='US', cat='all', count=300):
        """Request data from Google Realtime Search Trends section and return
        the 'trendingStories' entry of the response

        :param pn: value sent as the ``geo`` request parameter
        :param cat: value sent as the ``cat`` request parameter
        :param count: upper bound used to derive the ``ri``/``rs`` parameters
        """
        # Don't know what some of the params mean here, followed the nodejs library
        # https://github.com/pat310/google-trends-api/ 's implementation

        # sort: api accepts only 0 as the value, optional parameter

        # ri: number of trending stories IDs returned,
        # max value of ri supported is 300, based on empirical evidence
        ri_value = 300
        if count < ri_value:
            ri_value = count

        # rs: don't know what it does but its max value is never more than the ri_value based on empirical evidence
        # max value of rs supported is 200, based on empirical evidence
        rs_value = 200
        if count < rs_value:
            rs_value = count - 1

        forms = {'ns': 15, 'geo': pn, 'tz': '300', 'hl': self.hl, 'cat': cat,
                 'fi': '0', 'fs': '0', 'ri': ri_value, 'rs': rs_value,
                 'sort': 0}
        req_json = self._get_data(
            url=TrendReq.REALTIME_TRENDING_SEARCHES_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=forms
        )['storySummaries']['trendingStories']

        return req_json

    def top_charts(self, date, hl='en-US', tz=300, geo='GLOBAL'):
        """Request data from Google's Top Charts section and return a dataframe

        :param date: a year (anything ``int()`` accepts)
        :return: dataframe of the first chart's list items, or None when the
            response contains no chart
        :raises ValueError: when ``date`` cannot be converted to an integer
        """

        try:
            date = int(date)
        except (TypeError, ValueError) as err:
            raise ValueError(
                'The date must be a year with format YYYY. See https://github.com/GeneralMills/pytrends/issues/355') from err

        # create the payload
        chart_payload = {'hl': hl, 'tz': tz, 'date': date, 'geo': geo,
                         'isMobile': False}

        # make the request and parse the returned json
        req_json = self._get_data(
            url=TrendReq.TOP_CHARTS_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=chart_payload
        )
        try:
            df = pd.DataFrame(req_json['topCharts'][0]['listItems'])
        except IndexError:
            df = None
        return df

    def trends(self, date, hl='en-US', tz=300, geo='GLOBAL'):
        """Send a chart-style payload to ``GENERAL_URL`` and return the first
        'topCharts' entry as a dataframe (None when there is none)

        NOTE(review): the request goes to GENERAL_URL while the class defines
        a TRENDS_URL that nothing else uses, and the parsing mirrors
        top_charts -- confirm which endpoint and response shape are intended.
        """

        # create the payload
        chart_payload = {'hl': hl, 'tz': tz, 'date': date, 'geo': geo,
                         'isMobile': False}

        # make the request and parse the returned json
        req_json = self._get_data(
            url=TrendReq.GENERAL_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=chart_payload
        )
        try:
            df = pd.DataFrame(req_json['topCharts'][0]['listItems'])
        except IndexError:
            df = None
        return df

    def suggestions(self, keyword):
        """Request data from Google's Keyword Suggestion dropdown and return
        the 'topics' entry of the response"""

        # make the request
        kw_param = quote(keyword)
        parameters = {'hl': self.hl}

        req_json = self._get_data(
            url=TrendReq.SUGGESTIONS_URL + kw_param,
            params=parameters,
            method=TrendReq.GET_METHOD,
            trim_chars=5
        )['default']['topics']
        return req_json

    def categories(self):
        """Request available categories data from Google's API and return a dictionary"""

        params = {'hl': self.hl}

        req_json = self._get_data(
            url=TrendReq.CATEGORIES_URL,
            params=params,
            method=TrendReq.GET_METHOD,
            trim_chars=5
        )
        return req_json

    def get_historical_interest(self, *args, **kwargs):
        """Removed method kept only to raise an explanatory error."""
        raise NotImplementedError(
            "This method has been removed for incorrectness. It will be removed completely in v5.\n"
            "If you'd like similar functionality, please try implementing it yourself and consider submitting a pull request to add it to pytrends.\n"
            "\n"
            "There is discussion at:\n"
            "https://github.com/GeneralMills/pytrends/pull/542"
        )