# workspace/readsy/crawlReedsy.py
# Author: Penghaoo — "End of training" snapshot, commit 4d3e798 (verified)
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import re
import datetime
import os
import random
class ReedsyCrawler:
    """Crawl writing prompts and their submitted stories from blog.reedsy.com.

    Results are persisted incrementally as CSV files so the crawl can
    resume after an interruption:
      ./prompt.csv        - one row per prompt, with a 'done' progress flag
      ./stories/<id>.csv  - one file of scraped stories per prompt
    """

    def __init__(self):
        path = './'
        # makedirs(exist_ok=True) replaces the racy exists()/mkdir() pair
        # and creates the nested 'stories' directory in a single call.
        os.makedirs(os.path.join(path, 'stories'), exist_ok=True)
        self.folder = path
        # Load (or build from the network) the full prompt index.
        # NOTE: this performs network I/O at construction time.
        self.prompt_df = self.get_prompts()
def run(self):
for i, r in self.prompt_df[self.prompt_df['done'] == 0].iterrows():
try:
story_df, posted_date = self.get_stories(r)
story_df.to_csv(os.path.join(self.folder, 'stories', '{}.csv'.format(r['prompt_id'])), index=None)
self.prompt_df.loc[i, 'done'] = 1
self.prompt_df.loc[i, 'posted_date'] = posted_date
self.prompt_df.to_csv(os.path.join(self.folder, 'prompt.csv'), index=None)
except:
print(r)
def scrape_story(self, i, r):
story_df, posted_date = self.get_stories(r)
story_df.to_csv(os.path.join(self.folder, 'stories', '{}.csv'.format(r['prompt_id'])), index=None)
self.prompt_df.loc[i, 'done'] = 1
self.prompt_df.loc[i, 'posted_date'] = posted_date
self.prompt_df.to_csv(os.path.join(self.folder, 'prompt.csv'), index=None)
def get_soup(self,url):
attempt = 1
while True:
response = requests.get(
url,
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
)
soup = BeautifulSoup(response.content)
if response.status_code == 200:
time.sleep(0.2)
return soup
elif attempt >=5:
print('Attempt >= 5', url)
else:
attempt+=1
time.sleep(3)
def get_prompts(self):
# 1 get all prompts
# try to read prompt.csv
if os.path.exists(os.path.join(self.folder, 'prompt.csv')):
prompt_df = pd.read_csv(os.path.join(self.folder, 'prompt.csv'))
else:
prompt_df = pd.DataFrame(columns=['prompt_id', 'prompt', 'category', 'link', 'no_of_stories', 'done'])
prompt_result_list = []
# First page
soup = self.get_soup('https://blog.reedsy.com/creative-writing-prompts/')
prompt_last_page = int(soup.find_all('span', class_='page')[-1].find('a').text) # get the last page of prompts
prompt_list = soup.find_all('div', class_='prompt panel panel-thin panel-white-bordered space-bottom-xs-md') # get a list of all prompts in page
if len(prompt_df) > 0:
prompt_count = int(prompt_df['prompt_id'].max()[8:]) + 1
else:
prompt_count = 1
for i in range(1, prompt_last_page+1):
soup = self.get_soup(f'https://blog.reedsy.com/creative-writing-prompts/page/{i}')
time.sleep(1)
prompt_list = soup.find_all('div', class_='prompt panel panel-thin panel-white-bordered space-bottom-xs-md')
for p in prompt_list:
if p.find('p').text.strip()[:4] == 'LIVE':
continue # Skip the prompt if it's LIVE
category = p.find('p').text.split('–')[0].strip()
try:
count = int(p.find('p').text.split('–')[-1].replace(' stories', '').strip())
except:
count = 0
prompt = p.find('a').text
if len(prompt_df[prompt_df['prompt'] == prompt])==0:
new_row = {
'prompt_id': f"prompt_{prompt_count:04d}",
'prompt': p.find('a').text,
'category': category,
'link': 'https://blog.reedsy.com' + p.find('a').get('href'),
'no_of_stories': count,
'done': 0
}
prompt_result_list.append(new_row)
prompt_count += 1
new_prompt_df = pd.DataFrame(prompt_result_list)
prompt_df = pd.concat([prompt_df, new_prompt_df])
prompt_df.to_csv(os.path.join(self.folder, 'prompt.csv'), index=None)
return prompt_df
def get_stories(self, r):
# # 2. get all stories link
prompt_story_list = []
prompt_id = r['prompt_id']
prompt = r['prompt']
next_page = r['link']
while next_page:
prompt_soup = self.get_soup(next_page)
posted_date = prompt_soup.find('section', class_='row-thin').find('p').text.split('on')[-1].strip()
try:
next_page = 'https://blog.reedsy.com' + prompt_soup.find('div', id='submissions-load').find('a').get('href')
except:
next_page = None
stories = prompt_soup.find_all('div', class_='submission')
for s in stories:
new_row = {
'prompt_id': prompt_id,
'prompt': prompt,
'story_id': s.find('h3', class_='mimic-h4').find_all('a')[0].get('href').split('/')[-2],
'story_title': s.find('h3', class_='mimic-h4').find_all('a')[0].text,
'story_author': s.find('h3', class_='mimic-h4').find_all('a')[1].text,
'story_url': 'https://blog.reedsy.com' + s.find('h3', class_='mimic-h4').find_all('a')[0].get('href'),
'link': s.find('a').get('href'),
'genre': r['category'],
'is_sensitive': 0
}
new_row = self.get_story_info(new_row)
sensitive_div = s.find('div', class_='panel panel-thinner panel-white-bordered space-bottom-xs-md space-top-xs-md')
if sensitive_div:
if sensitive_div.text.strip() == 'This story contains sensitive content':
new_row['is_sensitive'] = 1
prompt_story_list.append(new_row)
prompt_story_df = pd.DataFrame(prompt_story_list)
prompt_story_df.to_csv(os.path.join(self.folder, 'stories', '{}.csv'.format(r['prompt_id'])), index=None)
return prompt_story_df, posted_date
def get_story_info(self, r, comment=True):
# 3. get story infomation
story_comment_list = []
story_id = r['story_id']
story_soup = self.get_soup(r['story_url'])
print(r['story_url'])
r['categories'] = str([a.text for a in story_soup.find('p', class_='small space-top-xs-md').find_all('a')])
r['likes'] = story_soup.find('p', class_='text-grey space-top-xs-md').find('button').text.strip()
r['story_text'] = story_soup.find('article').text
r['posted_date'] = story_soup.find('div', class_='grid gutter-xs-md no-response space-bottom-xs-md').find('div', class_='cell-shrink').text.strip()
if comment:
comment_soup = story_soup
while comment_soup:
story_comment_list += self.get_comment(comment_soup)
next_page = comment_soup.find('a', attrs={'rel': 'next'})
if next_page:
comment_soup = self.get_soup('https://blog.reedsy.com' + next_page.get('href'))
print('https://blog.reedsy.com' + next_page.get('href'))
else:
comment_soup = None
r['comments'] = story_comment_list
return r
def get_comment_info(self, c):
try:
author = c.find('a', class_='profile-link text-weight-normal text-blue').text.strip()
except:
author = 'Unknown user'
regex = re.compile('small text-wrap.*')
comment_text = c.find('p', class_=regex).text.strip()
time = c.find('small', class_='text-grey').text.strip()
points = c.find('button', class_='btn-no-decoration trigger-signup-modal text-grey').text.replace('points', '').strip()
return {
author: comment_text,
'time': time,
'points': points
}
def get_comment(self, s):
# handle all comments
story_comment_list = []
comment_containers = s.find_all('div', class_='comment-container')
for cc in comment_containers:
comments = cc.find_all('div', class_='comment')
# main comment
if len(comments) > 0:
main_comment = self.get_comment_info(comments[0])
nested_comments = []
if len(comments) > 1:
for c in comments[1:]:
nested_comments.append(self.get_comment_info(c))
story_comment_list.append([main_comment, nested_comments])
return story_comment_list
if __name__ == '__main__':
    # Build/load the prompt index (network I/O) and crawl all pending prompts.
    ReedsyCrawler().run()