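"""Scraper for Reedsy's creative writing prompts (https://blog.reedsy.com/creative-writing-prompts/).

Crawls every prompt into prompt.csv, then downloads each prompt's submitted
stories (text, metadata, and comments) into per-prompt CSVs under ./stories.
"""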
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
class ReedsyCrawler:
    def __init__(self):
        # Work out of the current directory; per-prompt story CSVs go in ./stories.
        self.folder = './'
        os.makedirs(os.path.join(self.folder, 'stories'), exist_ok=True)
        self.prompt_df = self.get_prompts()
    def run(self):
        """Scrape stories for every prompt that has not been marked done yet."""
        for i, r in self.prompt_df[self.prompt_df['done'] == 0].iterrows():
            try:
                story_df, posted_date = self.get_stories(r)
                story_df.to_csv(os.path.join(self.folder, 'stories', '{}.csv'.format(r['prompt_id'])), index=False)
                self.prompt_df.loc[i, 'done'] = 1
                self.prompt_df.loc[i, 'posted_date'] = posted_date
                # Persist progress after each prompt so an interrupted run can resume.
                self.prompt_df.to_csv(os.path.join(self.folder, 'prompt.csv'), index=False)
            except Exception as e:
                print(e, r)
    def scrape_story(self, i, r):
        """Scrape a single prompt row `r` (at index `i`) and mark it done."""
        story_df, posted_date = self.get_stories(r)
        story_df.to_csv(os.path.join(self.folder, 'stories', '{}.csv'.format(r['prompt_id'])), index=False)
        self.prompt_df.loc[i, 'done'] = 1
        self.prompt_df.loc[i, 'posted_date'] = posted_date
        self.prompt_df.to_csv(os.path.join(self.folder, 'prompt.csv'), index=False)
    def get_soup(self, url):
        """Fetch a URL and return parsed soup, retrying up to 5 times on non-200 responses."""
        attempt = 1
        while True:
            response = requests.get(
                url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
                }
            )
            if response.status_code == 200:
                time.sleep(0.2)  # be polite between successful requests
                return BeautifulSoup(response.content, 'html.parser')
            elif attempt >= 5:
                print('Attempt >= 5', url)
                return None  # give up after five failures instead of looping forever
            else:
                attempt += 1
                time.sleep(3)
    def get_prompts(self):
        """Step 1: collect every prompt into prompt.csv, resuming from an existing file if present."""
        if os.path.exists(os.path.join(self.folder, 'prompt.csv')):
            prompt_df = pd.read_csv(os.path.join(self.folder, 'prompt.csv'))
        else:
            prompt_df = pd.DataFrame(columns=['prompt_id', 'prompt', 'category', 'link', 'no_of_stories', 'done'])
        prompt_result_list = []
        # Fetch the first page only to read the number of the last prompt page.
        soup = self.get_soup('https://blog.reedsy.com/creative-writing-prompts/')
        prompt_last_page = int(soup.find_all('span', class_='page')[-1].find('a').text)
        if len(prompt_df) > 0:
            # prompt_id looks like 'prompt_0042'; the numeric part starts at index 7.
            prompt_count = int(prompt_df['prompt_id'].max()[7:]) + 1
        else:
            prompt_count = 1
        for i in range(1, prompt_last_page + 1):
            soup = self.get_soup(f'https://blog.reedsy.com/creative-writing-prompts/page/{i}')
            time.sleep(1)
            prompt_list = soup.find_all('div', class_='prompt panel panel-thin panel-white-bordered space-bottom-xs-md')
            for p in prompt_list:
                if p.find('p').text.strip()[:4] == 'LIVE':
                    continue  # skip contests that are still live
                category = p.find('p').text.split('–')[0].strip()
                try:
                    count = int(p.find('p').text.split('–')[-1].replace(' stories', '').strip())
                except ValueError:
                    count = 0
                prompt = p.find('a').text
                if len(prompt_df[prompt_df['prompt'] == prompt]) == 0:
                    new_row = {
                        'prompt_id': f"prompt_{prompt_count:04d}",
                        'prompt': prompt,
                        'category': category,
                        'link': 'https://blog.reedsy.com' + p.find('a').get('href'),
                        'no_of_stories': count,
                        'done': 0
                    }
                    prompt_result_list.append(new_row)
                    prompt_count += 1
        new_prompt_df = pd.DataFrame(prompt_result_list)
        prompt_df = pd.concat([prompt_df, new_prompt_df])
        prompt_df.to_csv(os.path.join(self.folder, 'prompt.csv'), index=False)
        return prompt_df
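    # Resulting prompt.csv columns: prompt_id, prompt, category, link,
    # no_of_stories, done (plus posted_date, filled in later by run()).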
    def get_stories(self, r):
        """Step 2: collect every story submitted to a prompt, following pagination."""
        prompt_story_list = []
        prompt_id = r['prompt_id']
        prompt = r['prompt']
        next_page = r['link']
        while next_page:
            prompt_soup = self.get_soup(next_page)
            posted_date = prompt_soup.find('section', class_='row-thin').find('p').text.split('on')[-1].strip()
            try:
                next_page = 'https://blog.reedsy.com' + prompt_soup.find('div', id='submissions-load').find('a').get('href')
            except AttributeError:
                next_page = None  # no "load more" link: this was the last page
            stories = prompt_soup.find_all('div', class_='submission')
            for s in stories:
                title_links = s.find('h3', class_='mimic-h4').find_all('a')
                new_row = {
                    'prompt_id': prompt_id,
                    'prompt': prompt,
                    'story_id': title_links[0].get('href').split('/')[-2],
                    'story_title': title_links[0].text,
                    'story_author': title_links[1].text,
                    'story_url': 'https://blog.reedsy.com' + title_links[0].get('href'),
                    'link': s.find('a').get('href'),
                    'genre': r['category'],
                    'is_sensitive': 0
                }
                new_row = self.get_story_info(new_row)
                sensitive_div = s.find('div', class_='panel panel-thinner panel-white-bordered space-bottom-xs-md space-top-xs-md')
                if sensitive_div and sensitive_div.text.strip() == 'This story contains sensitive content':
                    new_row['is_sensitive'] = 1
                prompt_story_list.append(new_row)
        prompt_story_df = pd.DataFrame(prompt_story_list)
        prompt_story_df.to_csv(os.path.join(self.folder, 'stories', '{}.csv'.format(r['prompt_id'])), index=False)
        return prompt_story_df, posted_date
    def get_story_info(self, r, comment=True):
        """Step 3: fetch one story page and fill in its text, metadata, and (optionally) comments."""
        story_comment_list = []
        story_soup = self.get_soup(r['story_url'])
        print(r['story_url'])
        r['categories'] = str([a.text for a in story_soup.find('p', class_='small space-top-xs-md').find_all('a')])
        r['likes'] = story_soup.find('p', class_='text-grey space-top-xs-md').find('button').text.strip()
        r['story_text'] = story_soup.find('article').text
        r['posted_date'] = story_soup.find('div', class_='grid gutter-xs-md no-response space-bottom-xs-md').find('div', class_='cell-shrink').text.strip()
        if comment:
            # Comments are paginated; follow rel="next" links until there are none left.
            comment_soup = story_soup
            while comment_soup:
                story_comment_list += self.get_comment(comment_soup)
                next_page = comment_soup.find('a', attrs={'rel': 'next'})
                if next_page:
                    comment_soup = self.get_soup('https://blog.reedsy.com' + next_page.get('href'))
                    print('https://blog.reedsy.com' + next_page.get('href'))
                else:
                    comment_soup = None
            r['comments'] = story_comment_list
        return r
    def get_comment_info(self, c):
        """Extract one comment's author, text, timestamp, and points."""
        try:
            author = c.find('a', class_='profile-link text-weight-normal text-blue').text.strip()
        except AttributeError:
            author = 'Unknown user'  # deleted or anonymous accounts have no profile link
        regex = re.compile('small text-wrap.*')
        comment_text = c.find('p', class_=regex).text.strip()
        # Named comment_time to avoid shadowing the imported time module.
        comment_time = c.find('small', class_='text-grey').text.strip()
        points = c.find('button', class_='btn-no-decoration trigger-signup-modal text-grey').text.replace('points', '').strip()
        return {
            'author': author,
            'comment': comment_text,
            'time': comment_time,
            'points': points
        }
    def get_comment(self, s):
        """Collect every top-level comment on the page along with its nested replies."""
        story_comment_list = []
        comment_containers = s.find_all('div', class_='comment-container')
        for cc in comment_containers:
            comments = cc.find_all('div', class_='comment')
            if len(comments) > 0:
                # The first div is the top-level comment; any others are replies to it.
                main_comment = self.get_comment_info(comments[0])
                nested_comments = [self.get_comment_info(c) for c in comments[1:]]
                story_comment_list.append([main_comment, nested_comments])
        return story_comment_list
if __name__ == '__main__':
    rc = ReedsyCrawler()
    rc.run()
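# Example: re-scrape a single prompt by hand (hypothetical usage; any valid
# index into prompt_df works):
#   rc = ReedsyCrawler()
#   rc.scrape_story(0, rc.prompt_df.loc[0])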