Add1E committed on
Commit
f971e85
·
verified ·
1 Parent(s): 6dda1aa

Upload 8 files

Browse files
pytrends/__init__.py ADDED
File without changes
pytrends/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (186 Bytes). View file
 
pytrends/__pycache__/dailydata.cpython-310.pyc ADDED
Binary file (4.73 kB). View file
 
pytrends/__pycache__/exceptions.cpython-310.pyc ADDED
Binary file (1.11 kB). View file
 
pytrends/__pycache__/request.cpython-310.pyc ADDED
Binary file (15.4 kB). View file
 
pytrends/dailydata.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import date, timedelta
2
+ from functools import partial
3
+ from time import sleep
4
+ from calendar import monthrange
5
+
6
+ import pandas as pd
7
+
8
+ from pytrends.exceptions import ResponseError
9
+ from pytrends.request import TrendReq
10
+
11
+
12
def get_last_date_of_month(year: int, month: int) -> date:
    """Return a ``date`` for the final day of the given month.

    Source: https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python
    """
    _, last_day = monthrange(year, month)
    return date(year, month, last_day)
19
+
20
+
21
def convert_dates_to_timeframe(start: date, stop: date) -> str:
    """Build the 'YYYY-MM-DD YYYY-MM-DD' timeframe string that Google Trends
    expects for a custom date interval.
    """
    return '{0:%Y-%m-%d} {1:%Y-%m-%d}'.format(start, stop)
27
+
28
+
29
def _fetch_data(pytrends, build_payload, timeframe: str) -> pd.DataFrame:
    """Attempt to build the payload for *timeframe*, retrying with a growing
    back-off when Google answers with a ResponseError, then return the
    interest-over-time DataFrame.

    Gives up (but still queries interest_over_time) after the retry budget
    is exhausted.
    """
    attempts = 0
    while True:
        try:
            build_payload(timeframe=timeframe)
        except ResponseError as err:
            print(err)
            print(f'Trying again in {60 + 5 * attempts} seconds.')
            sleep(60 + 5 * attempts)
            attempts += 1
            if attempts > 3:
                print('Failed after 3 attemps, abort fetching.')
                break
        else:
            # payload built successfully; stop retrying
            break
    return pytrends.interest_over_time()
46
+
47
+
48
def get_daily_data(word: str,
                   start_year: int,
                   start_mon: int,
                   stop_year: int,
                   stop_mon: int,
                   geo: str = 'US',
                   verbose: bool = True,
                   wait_time: float = 5.0) -> pd.DataFrame:
    """Given a word, fetches daily search volume data from Google Trends and
    returns results in a pandas DataFrame.

    Details: Due to the way Google Trends scales and returns data, special
    care needs to be taken to make the daily data comparable over different
    months. To do that, we download daily data on a month by month basis,
    and also monthly data. The monthly data is downloaded in one go, so that
    the monthly values are comparable amongst themselves and can be used to
    scale the daily data. The daily data is scaled by multiplying the daily
    value by the monthly search volume divided by 100.
    For a more detailed explanation see http://bit.ly/trendsscaling

    Args:
        word (str): Word to fetch daily data for.
        start_year (int): the start year
        start_mon (int): start 1st day of the month
        stop_year (int): the end year
        stop_mon (int): end at the last day of the month
        geo (str): geolocation
        verbose (bool): If True, then prints the word and current time frame
            we are fetching the data for.
        wait_time (float): seconds to sleep between monthly requests so that
            Google does not throttle us with 429 responses.

    Returns:
        complete (pd.DataFrame): Contains 4 columns.
            The column named after the word argument contains the daily search
            volume already scaled and comparable through time.
            The column f'{word}_unscaled' is the original daily data fetched
            month by month, and it is not comparable across different months
            (but is comparable within a month).
            The column f'{word}_monthly' contains the original monthly data
            fetched at once. The values in this column have been forward-filled
            so that there are no NaN present.
            The column 'scale' contains the scale used to obtain the scaled
            daily data.
    """
    # Set up start and stop dates
    start_date = date(start_year, start_mon, 1)
    stop_date = get_last_date_of_month(stop_year, stop_mon)

    # Start pytrends for US region
    pytrends = TrendReq(hl='en-US', tz=360)
    # Initialize build_payload with the word we need data for
    build_payload = partial(pytrends.build_payload,
                            kw_list=[word], cat=0, geo=geo, gprop='')

    # Obtain monthly data for all months in years [start_year, stop_year]
    monthly = _fetch_data(pytrends, build_payload,
                          convert_dates_to_timeframe(start_date, stop_date))

    # Get daily data, month by month
    results = {}
    # if a timeout or too many requests error occur we need to adjust wait time
    current = start_date
    while current < stop_date:
        last_date_of_month = get_last_date_of_month(current.year, current.month)
        timeframe = convert_dates_to_timeframe(current, last_date_of_month)
        if verbose:
            print(f'{word}:{timeframe}')
        results[current] = _fetch_data(pytrends, build_payload, timeframe)
        current = last_date_of_month + timedelta(days=1)
        sleep(wait_time)  # don't go too fast or Google will send 429s

    daily = pd.concat(results.values()).drop(columns=['isPartial'])
    complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')

    # Scale daily data by monthly weights so the data is comparable.
    # Assign the ffill() result instead of calling ffill(inplace=True) on a
    # column selection: that form is chained assignment, emits a
    # FutureWarning, and silently stops mutating under pandas copy-on-write.
    complete[f'{word}_monthly'] = complete[f'{word}_monthly'].ffill()
    complete['scale'] = complete[f'{word}_monthly'] / 100
    complete[word] = complete[f'{word}_unscaled'] * complete.scale

    return complete
pytrends/exceptions.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class ResponseError(Exception):
    """Raised when Google sends back a response we cannot use."""

    def __init__(self, message, response):
        super().__init__(message)
        # keep the raw response around so callers can inspect it upstream
        self.response = response

    @classmethod
    def from_response(cls, response):
        """Alternate constructor: derive the message from the HTTP status."""
        msg = f'The request failed: Google returned a response with code {response.status_code}'
        return cls(msg, response)
13
+
14
+
15
class TooManyRequestsError(ResponseError):
    """Raised when the backend answers with an HTTP 429 (rate limit) code."""
pytrends/request.py ADDED
@@ -0,0 +1,594 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import pandas as pd
4
+ import requests
5
+
6
+ from requests.adapters import HTTPAdapter
7
+ from requests.packages.urllib3.util.retry import Retry
8
+ from requests import status_codes
9
+
10
+ from pytrends import exceptions
11
+
12
+ from urllib.parse import quote
13
+
14
+
15
+ BASE_TRENDS_URL = 'https://trends.google.com/trends'
16
+
17
+
18
class TrendReq(object):
    """
    Google Trends API
    """
    # HTTP verbs understood by _get_data
    GET_METHOD = 'get'
    POST_METHOD = 'post'
    # token endpoint: every data call first needs widget tokens issued here
    GENERAL_URL = f'{BASE_TRENDS_URL}/api/explore'
    # widget-data endpoints, one per report type
    INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multiline'
    MULTIRANGE_INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multirange'
    INTEREST_BY_REGION_URL = f'{BASE_TRENDS_URL}/api/widgetdata/comparedgeo'
    RELATED_QUERIES_URL = f'{BASE_TRENDS_URL}/api/widgetdata/relatedsearches'
    TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/hottrends/visualize/internal/data'
    TOP_CHARTS_URL = f'{BASE_TRENDS_URL}/api/topcharts'
    SUGGESTIONS_URL = f'{BASE_TRENDS_URL}/api/autocomplete/'
    CATEGORIES_URL = f'{BASE_TRENDS_URL}/api/explore/pickers/category'
    TODAY_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/dailytrends'
    REALTIME_TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/realtimetrends'
    # statuses handed to urllib3's Retry as status_forcelist in _get_data
    ERROR_CODES = (500, 502, 504, 429)
36
+
37
    def __init__(self, hl='en-US', tz=360, geo='', timeout=(2, 5), proxies='',
                 retries=0, backoff_factor=0, requests_args=None):
        """
        Initialize default values for params

        :param hl: host language, e.g. 'en-US'; its last two chars are also
            used as the geo when fetching the initial cookie
        :param tz: timezone offset in minutes sent with every request
        :param geo: default two-letter geo code used by build_payload
        :param timeout: (connect, read) timeout tuple passed to requests
        :param proxies: list of 'https://host:port' strings ('' disables)
        :param retries: urllib3 Retry count used by _get_data
        :param backoff_factor: urllib3 back-off factor used by _get_data
        :param requests_args: extra kwargs forwarded to every requests call;
            a 'headers' entry is popped and merged into the default headers

        NOTE: the constructor performs network I/O (fetches a Google cookie).
        """
        # google rate limit
        self.google_rl = 'You have reached your quota limit. Please try again later.'
        self.results = None
        # set user defined options used globally
        self.tz = tz
        self.hl = hl
        self.geo = geo
        self.kw_list = list()
        self.timeout = timeout
        self.proxies = proxies  # add a proxy option
        self.retries = retries
        self.backoff_factor = backoff_factor
        # index into self.proxies; rotated by GetNewProxy
        self.proxy_index = 0
        self.requests_args = requests_args or {}
        # fetch the NID cookie once up front (per proxy later on)
        self.cookies = self.GetGoogleCookie()
        # intialize widget payloads (filled in by build_payload/_tokens)
        self.token_payload = dict()
        self.interest_over_time_widget = dict()
        self.interest_by_region_widget = dict()
        self.related_topics_widget_list = list()
        self.related_queries_widget_list = list()

        self.headers = {'accept-language': self.hl}
        # caller-supplied headers win over the default accept-language
        self.headers.update(self.requests_args.pop('headers', {}))
66
+
67
    def GetGoogleCookie(self):
        """
        Gets google cookie (used for each and every proxy; once on init otherwise)
        Removes proxy from the list on proxy error

        Returns a dict containing only the 'NID' cookie obtained from the
        Trends explore page.

        NOTE(review): when proxies are supplied via requests_args, the bare
        ``except: continue`` below retries forever on *any* failure
        (including KeyboardInterrupt) — consider narrowing the exception and
        bounding the retries.
        """
        while True:
            if "proxies" in self.requests_args:
                try:
                    # keep only the NID cookie from the response's cookie jar
                    return dict(filter(lambda i: i[0] == 'NID', requests.get(
                        f'{BASE_TRENDS_URL}/explore/?geo={self.hl[-2:]}',
                        timeout=self.timeout,
                        **self.requests_args
                    ).cookies.items()))
                except:
                    continue
            else:
                if len(self.proxies) > 0:
                    proxy = {'https': self.proxies[self.proxy_index]}
                else:
                    proxy = ''
                try:
                    return dict(filter(lambda i: i[0] == 'NID', requests.get(
                        f'{BASE_TRENDS_URL}/explore/?geo={self.hl[-2:]}',
                        timeout=self.timeout,
                        proxies=proxy,
                        **self.requests_args
                    ).cookies.items()))
                except requests.exceptions.ProxyError:
                    print('Proxy error. Changing IP')
                    if len(self.proxies) > 1:
                        # drop the bad proxy and fall through to retry
                        self.proxies.remove(self.proxies[self.proxy_index])
                    else:
                        print('No more proxies available. Bye!')
                        raise
                    continue
102
+
103
+ def GetNewProxy(self):
104
+ """
105
+ Increment proxy INDEX; zero on overflow
106
+ """
107
+ if self.proxy_index < (len(self.proxies) - 1):
108
+ self.proxy_index += 1
109
+ else:
110
+ self.proxy_index = 0
111
+
112
+ def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
113
+ """Send a request to Google and return the JSON response as a Python object
114
+ :param url: the url to which the request will be sent
115
+ :param method: the HTTP method ('get' or 'post')
116
+ :param trim_chars: how many characters should be trimmed off the beginning of the content of the response
117
+ before this is passed to the JSON parser
118
+ :param kwargs: any extra key arguments passed to the request builder (usually query parameters or data)
119
+ :return:
120
+ """
121
+ s = requests.session()
122
+ # Retries mechanism. Activated when one of statements >0 (best used for proxy)
123
+ if self.retries > 0 or self.backoff_factor > 0:
124
+ retry = Retry(total=self.retries, read=self.retries,
125
+ connect=self.retries,
126
+ backoff_factor=self.backoff_factor,
127
+ status_forcelist=TrendReq.ERROR_CODES,
128
+ method_whitelist=frozenset(['GET', 'POST']))
129
+ s.mount('https://', HTTPAdapter(max_retries=retry))
130
+
131
+ s.headers.update(self.headers)
132
+ if len(self.proxies) > 0:
133
+ self.cookies = self.GetGoogleCookie()
134
+ s.proxies.update({'https': self.proxies[self.proxy_index]})
135
+ if method == TrendReq.POST_METHOD:
136
+ response = s.post(url, timeout=self.timeout,
137
+ cookies=self.cookies, **kwargs,
138
+ **self.requests_args) # DO NOT USE retries or backoff_factor here
139
+ else:
140
+ response = s.get(url, timeout=self.timeout, cookies=self.cookies,
141
+ **kwargs, **self.requests_args) # DO NOT USE retries or backoff_factor here
142
+ # check if the response contains json and throw an exception otherwise
143
+ # Google mostly sends 'application/json' in the Content-Type header,
144
+ # but occasionally it sends 'application/javascript
145
+ # and sometimes even 'text/javascript
146
+ if response.status_code == 200 and 'application/json' in \
147
+ response.headers['Content-Type'] or \
148
+ 'application/javascript' in response.headers['Content-Type'] or \
149
+ 'text/javascript' in response.headers['Content-Type']:
150
+ # trim initial characters
151
+ # some responses start with garbage characters, like ")]}',"
152
+ # these have to be cleaned before being passed to the json parser
153
+ content = response.text[trim_chars:]
154
+ # parse json
155
+ self.GetNewProxy()
156
+ return json.loads(content)
157
+ else:
158
+ if response.status_code == status_codes.codes.too_many_requests:
159
+ raise exceptions.TooManyRequestsError.from_response(response)
160
+ raise exceptions.ResponseError.from_response(response)
161
+
162
+ def build_payload(self, kw_list, cat=0, timeframe='today 5-y', geo='',
163
+ gprop=''):
164
+ """Create the payload for related queries, interest over time and interest by region"""
165
+ if gprop not in ['', 'images', 'news', 'youtube', 'froogle']:
166
+ raise ValueError('gprop must be empty (to indicate web), images, news, youtube, or froogle')
167
+ self.kw_list = kw_list
168
+ self.geo = geo or self.geo
169
+ self.token_payload = {
170
+ 'hl': self.hl,
171
+ 'tz': self.tz,
172
+ 'req': {'comparisonItem': [], 'category': cat, 'property': gprop}
173
+ }
174
+
175
+ # Check if timeframe is a list
176
+ if isinstance(timeframe, list):
177
+ for index, kw in enumerate(self.kw_list):
178
+ keyword_payload = {'keyword': kw, 'time': timeframe[index], 'geo': self.geo}
179
+ self.token_payload['req']['comparisonItem'].append(keyword_payload)
180
+ else:
181
+ # build out json for each keyword with
182
+ for kw in self.kw_list:
183
+ keyword_payload = {'keyword': kw, 'time': timeframe, 'geo': self.geo}
184
+ self.token_payload['req']['comparisonItem'].append(keyword_payload)
185
+
186
+ # requests will mangle this if it is not a string
187
+ self.token_payload['req'] = json.dumps(self.token_payload['req'])
188
+ # get tokens
189
+ self._tokens()
190
+ return
191
+
192
    def _tokens(self):
        """Makes request to Google to get API tokens for interest over time, interest by region and related queries

        Populates the widget attributes that the report methods read; called
        by build_payload after the payload is assembled.
        """
        # make the request and parse the returned json
        widget_dicts = self._get_data(
            url=TrendReq.GENERAL_URL,
            method=TrendReq.POST_METHOD,
            params=self.token_payload,
            trim_chars=4,
        )['widgets']
        # order of the json matters...
        first_region_token = True
        # clear self.related_queries_widget_list and self.related_topics_widget_list
        # of old keywords'widgets
        self.related_queries_widget_list[:] = []
        self.related_topics_widget_list[:] = []
        # assign requests
        for widget in widget_dicts:
            if widget['id'] == 'TIMESERIES':
                self.interest_over_time_widget = widget
            # keep only the first GEO_MAP widget Google sends back
            if widget['id'] == 'GEO_MAP' and first_region_token:
                self.interest_by_region_widget = widget
                first_region_token = False
            # response for each term, put into a list
            if 'RELATED_TOPICS' in widget['id']:
                self.related_topics_widget_list.append(widget)
            if 'RELATED_QUERIES' in widget['id']:
                self.related_queries_widget_list.append(widget)
        return
220
+
221
    def interest_over_time(self):
        """Request data from Google's Interest Over Time section and return a dataframe

        Returns a DataFrame indexed by date with one int column per keyword
        in self.kw_list plus a bool 'isPartial' column; empty DataFrame when
        Google returns no timeline data.
        """

        over_time_payload = {
            # convert to string as requests will mangle
            'req': json.dumps(self.interest_over_time_widget['request']),
            'token': self.interest_over_time_widget['token'],
            'tz': self.tz
        }

        # make the request and parse the returned json
        req_json = self._get_data(
            url=TrendReq.INTEREST_OVER_TIME_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=over_time_payload,
        )

        df = pd.DataFrame(req_json['default']['timelineData'])
        if (df.empty):
            return df

        # 'time' comes back as epoch seconds (as strings); index by datetime
        df['date'] = pd.to_datetime(df['time'].astype(dtype='float64'),
                                    unit='s')
        df = df.set_index(['date']).sort_index()
        # split list columns into seperate ones, remove brackets and split on comma
        result_df = df['value'].apply(lambda x: pd.Series(
            str(x).replace('[', '').replace(']', '').split(',')))
        # rename each column with its search term, relying on order that google provides...
        for idx, kw in enumerate(self.kw_list):
            # there is currently a bug with assigning columns that may be
            # parsed as a date in pandas: use explicit insert column method
            result_df.insert(len(result_df.columns), kw,
                             result_df[idx].astype('int'))
            del result_df[idx]

        if 'isPartial' in df:
            # make other dataframe from isPartial key data
            # split list columns into seperate ones, remove brackets and split on comma
            df = df.fillna(False)
            result_df2 = df['isPartial'].apply(lambda x: pd.Series(
                str(x).replace('[', '').replace(']', '').split(',')))
            result_df2.columns = ['isPartial']
            # Change to a bool type.
            result_df2.isPartial = result_df2.isPartial == 'True'
            # concatenate the two dataframes
            final = pd.concat([result_df, result_df2], axis=1)
        else:
            final = result_df
            final['isPartial'] = False

        return final
273
+
274
    def multirange_interest_over_time(self):
        """Request data from Google's Interest Over Time section across different time ranges and return a dataframe

        Each keyword/timeframe pair yields a '[i] <kw> date' and a
        '[i] <kw> value' column; the first row holds Google's averages.
        """

        over_time_payload = {
            # convert to string as requests will mangle
            'req': json.dumps(self.interest_over_time_widget['request']),
            'token': self.interest_over_time_widget['token'],
            'tz': self.tz
        }

        # make the request and parse the returned json
        req_json = self._get_data(
            url=TrendReq.MULTIRANGE_INTEREST_OVER_TIME_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=over_time_payload,
        )

        df = pd.DataFrame(req_json['default']['timelineData'])
        if (df.empty):
            return df

        result_df = pd.json_normalize(df['columnData'])

        # Split dictionary columns into seperate ones
        for i, column in enumerate(result_df.columns):
            result_df["[" + str(i) + "] " + str(self.kw_list[i]) + " date"] = result_df[i].apply(pd.Series)["formattedTime"]
            result_df["[" + str(i) + "] " + str(self.kw_list[i]) + " value"] = result_df[i].apply(pd.Series)["value"]
            result_df = result_df.drop([i], axis=1)

        # Adds a row with the averages at the top of the dataframe
        avg_row = {}
        for i, avg in enumerate(req_json['default']['averages']):
            avg_row["[" + str(i) + "] " + str(self.kw_list[i]) + " date"] = "Average"
            avg_row["[" + str(i) + "] " + str(self.kw_list[i]) + " value"] = req_json['default']['averages'][i]

        # insert the averages row at index -1 then shift so it becomes row 0
        result_df.loc[-1] = avg_row
        result_df.index = result_df.index + 1
        result_df = result_df.sort_index()

        return result_df
315
+
316
+
317
    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
                           inc_geo_code=False):
        """Request data from Google's Interest by Region section and return a dataframe

        :param resolution: 'COUNTRY'; with geo == 'US' also 'DMA', 'CITY', 'REGION'
        :param inc_low_vol: include low-search-volume regions in the result
        :param inc_geo_code: include the geo code / coordinates column
        """

        # make the request
        region_payload = dict()
        # resolution is only applied worldwide, or within the US for the
        # finer-grained breakdowns
        if self.geo == '':
            self.interest_by_region_widget['request'][
                'resolution'] = resolution
        elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
            self.interest_by_region_widget['request'][
                'resolution'] = resolution

        self.interest_by_region_widget['request'][
            'includeLowSearchVolumeGeos'] = inc_low_vol

        # convert to string as requests will mangle
        region_payload['req'] = json.dumps(
            self.interest_by_region_widget['request'])
        region_payload['token'] = self.interest_by_region_widget['token']
        region_payload['tz'] = self.tz

        # parse returned json
        req_json = self._get_data(
            url=TrendReq.INTEREST_BY_REGION_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=region_payload,
        )
        df = pd.DataFrame(req_json['default']['geoMapData'])
        if (df.empty):
            return df

        # rename the column with the search keyword
        # Google returns either 'geoCode' or 'coordinates' depending on resolution
        geo_column = 'geoCode' if 'geoCode' in df.columns else 'coordinates'
        columns = ['geoName', geo_column, 'value']
        df = df[columns].set_index(['geoName']).sort_index()
        # split list columns into separate ones, remove brackets and split on comma
        result_df = df['value'].apply(lambda x: pd.Series(
            str(x).replace('[', '').replace(']', '').split(',')))
        if inc_geo_code:
            if geo_column in df.columns:
                result_df[geo_column] = df[geo_column]
            else:
                print('Could not find geo_code column; Skipping')

        # rename each column with its search term
        for idx, kw in enumerate(self.kw_list):
            result_df[kw] = result_df[idx].astype('int')
            del result_df[idx]

        return result_df
369
+
370
    def related_topics(self):
        """Request data from Google's Related Topics section and return a dictionary of dataframes

        If no top and/or rising related topics are found, the value for the key "top" and/or "rising" will be None

        Returns a dict keyed by keyword, each value a
        {'rising': DataFrame|None, 'top': DataFrame|None} dict.
        """

        # make the request
        related_payload = dict()
        result_dict = dict()
        for request_json in self.related_topics_widget_list:
            # ensure we know which keyword we are looking at rather than relying on order
            try:
                kw = request_json['request']['restriction'][
                    'complexKeywordsRestriction']['keyword'][0]['value']
            except KeyError:
                kw = ''
            # convert to string as requests will mangle
            related_payload['req'] = json.dumps(request_json['request'])
            related_payload['token'] = request_json['token']
            related_payload['tz'] = self.tz

            # parse the returned json
            # (the relatedsearches endpoint serves both topics and queries)
            req_json = self._get_data(
                url=TrendReq.RELATED_QUERIES_URL,
                method=TrendReq.GET_METHOD,
                trim_chars=5,
                params=related_payload,
            )

            # top topics
            try:
                top_list = req_json['default']['rankedList'][0]['rankedKeyword']
                df_top = pd.json_normalize(top_list, sep='_')
            except KeyError:
                # in case no top topics are found, the lines above will throw a KeyError
                df_top = None

            # rising topics
            try:
                rising_list = req_json['default']['rankedList'][1]['rankedKeyword']
                df_rising = pd.json_normalize(rising_list, sep='_')
            except KeyError:
                # in case no rising topics are found, the lines above will throw a KeyError
                df_rising = None

            result_dict[kw] = {'rising': df_rising, 'top': df_top}
        return result_dict
417
+
418
    def related_queries(self):
        """Request data from Google's Related Queries section and return a dictionary of dataframes

        If no top and/or rising related queries are found, the value for the key "top" and/or "rising" will be None

        Returns a dict keyed by keyword, each value a
        {'top': DataFrame|None, 'rising': DataFrame|None} dict with
        'query'/'value' columns.
        """

        # make the request
        related_payload = dict()
        result_dict = dict()
        for request_json in self.related_queries_widget_list:
            # ensure we know which keyword we are looking at rather than relying on order
            try:
                kw = request_json['request']['restriction'][
                    'complexKeywordsRestriction']['keyword'][0]['value']
            except KeyError:
                kw = ''
            # convert to string as requests will mangle
            related_payload['req'] = json.dumps(request_json['request'])
            related_payload['token'] = request_json['token']
            related_payload['tz'] = self.tz

            # parse the returned json
            req_json = self._get_data(
                url=TrendReq.RELATED_QUERIES_URL,
                method=TrendReq.GET_METHOD,
                trim_chars=5,
                params=related_payload,
            )

            # top queries
            try:
                top_df = pd.DataFrame(
                    req_json['default']['rankedList'][0]['rankedKeyword'])
                top_df = top_df[['query', 'value']]
            except KeyError:
                # in case no top queries are found, the lines above will throw a KeyError
                top_df = None

            # rising queries
            try:
                rising_df = pd.DataFrame(
                    req_json['default']['rankedList'][1]['rankedKeyword'])
                rising_df = rising_df[['query', 'value']]
            except KeyError:
                # in case no rising queries are found, the lines above will throw a KeyError
                rising_df = None

            result_dict[kw] = {'top': top_df, 'rising': rising_df}
        return result_dict
467
+
468
+ def trending_searches(self, pn='united_states'):
469
+ """Request data from Google's Hot Searches section and return a dataframe"""
470
+
471
+ # make the request
472
+ # forms become obsolete due to the new TRENDING_SEARCHES_URL
473
+ # forms = {'ajax': 1, 'pn': pn, 'htd': '', 'htv': 'l'}
474
+ req_json = self._get_data(
475
+ url=TrendReq.TRENDING_SEARCHES_URL,
476
+ method=TrendReq.GET_METHOD
477
+ )[pn]
478
+ result_df = pd.DataFrame(req_json)
479
+ return result_df
480
+
481
+ def today_searches(self, pn='US'):
482
+ """Request data from Google Daily Trends section and returns a dataframe"""
483
+ forms = {'ns': 15, 'geo': pn, 'tz': '-180', 'hl': self.hl}
484
+ req_json = self._get_data(
485
+ url=TrendReq.TODAY_SEARCHES_URL,
486
+ method=TrendReq.GET_METHOD,
487
+ trim_chars=5,
488
+ params=forms,
489
+ **self.requests_args
490
+ )['default']['trendingSearchesDays'][0]['trendingSearches']
491
+ # parse the returned json
492
+ result_df = pd.DataFrame(trend['title'] for trend in req_json)
493
+ return result_df.iloc[:, -1]
494
+
495
+ def realtime_trending_searches(self, pn='US', cat='all', count =300):
496
+ """Request data from Google Realtime Search Trends section and returns a dataframe"""
497
+ # Don't know what some of the params mean here, followed the nodejs library
498
+ # https://github.com/pat310/google-trends-api/ 's implemenration
499
+
500
+
501
+ #sort: api accepts only 0 as the value, optional parameter
502
+
503
+ # ri: number of trending stories IDs returned,
504
+ # max value of ri supported is 300, based on emperical evidence
505
+
506
+ ri_value = 300
507
+ if count < ri_value:
508
+ ri_value = count
509
+
510
+ # rs : don't know what is does but it's max value is never more than the ri_value based on emperical evidence
511
+ # max value of ri supported is 200, based on emperical evidence
512
+ rs_value = 200
513
+ if count < rs_value:
514
+ rs_value = count-1
515
+
516
+ forms = {'ns': 15, 'geo': pn, 'tz': '300', 'hl': self.hl, 'cat': cat, 'fi' : '0', 'fs' : '0', 'ri' : ri_value, 'rs' : rs_value, 'sort' : 0}
517
+ req_json = self._get_data(
518
+ url=TrendReq.REALTIME_TRENDING_SEARCHES_URL,
519
+ method=TrendReq.GET_METHOD,
520
+ trim_chars=5,
521
+ params=forms
522
+ )['storySummaries']['trendingStories']
523
+
524
+ # parse the returned json
525
+ #wanted_keys = ["entityNames", "title"]
526
+
527
+ #final_json = [{ key: ts[key] for key in ts.keys() if key in wanted_keys} for ts in req_json ]
528
+
529
+ #result_df = pd.DataFrame(final_json)
530
+
531
+ return req_json
532
+
533
+ def top_charts(self, date, hl='en-US', tz=300, geo='GLOBAL'):
534
+ """Request data from Google's Top Charts section and return a dataframe"""
535
+
536
+ try:
537
+ date = int(date)
538
+ except:
539
+ raise ValueError(
540
+ 'The date must be a year with format YYYY. See https://github.com/GeneralMills/pytrends/issues/355')
541
+
542
+ # create the payload
543
+ chart_payload = {'hl': hl, 'tz': tz, 'date': date, 'geo': geo,
544
+ 'isMobile': False}
545
+
546
+ # make the request and parse the returned json
547
+ req_json = self._get_data(
548
+ url=TrendReq.TOP_CHARTS_URL,
549
+ method=TrendReq.GET_METHOD,
550
+ trim_chars=5,
551
+ params=chart_payload
552
+ )
553
+ try:
554
+ df = pd.DataFrame(req_json['topCharts'][0]['listItems'])
555
+ except IndexError:
556
+ df = None
557
+ return df
558
+
559
+ def suggestions(self, keyword):
560
+ """Request data from Google's Keyword Suggestion dropdown and return a dictionary"""
561
+
562
+ # make the request
563
+ kw_param = quote(keyword)
564
+ parameters = {'hl': self.hl}
565
+
566
+ req_json = self._get_data(
567
+ url=TrendReq.SUGGESTIONS_URL + kw_param,
568
+ params=parameters,
569
+ method=TrendReq.GET_METHOD,
570
+ trim_chars=5
571
+ )['default']['topics']
572
+ return req_json
573
+
574
+ def categories(self):
575
+ """Request available categories data from Google's API and return a dictionary"""
576
+
577
+ params = {'hl': self.hl}
578
+
579
+ req_json = self._get_data(
580
+ url=TrendReq.CATEGORIES_URL,
581
+ params=params,
582
+ method=TrendReq.GET_METHOD,
583
+ trim_chars=5
584
+ )
585
+ return req_json
586
+
587
+ def get_historical_interest(self, *args, **kwargs):
588
+ raise NotImplementedError(
589
+ """This method has been removed for incorrectness. It will be removed completely in v5.
590
+ If you'd like similar functionality, please try implementing it yourself and consider submitting a pull request to add it to pytrends.
591
+
592
+ There is discussion at:
593
+ https://github.com/GeneralMills/pytrends/pull/542"""
594
+ )