PolyakovK's picture
initial
a23f3fb
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Result table for the scrape. All five columns are declared up front so the
# detail fields ('image_url', 'annotation') exist before they are filled in.
df = pd.DataFrame(
    columns=['page_url', 'image_url', 'author', 'title', 'annotation'],
)
def extract_data_from_page(page_number, timeout=30):
    """Scrape one catalogue listing page and return its book entries.

    Downloads the listing URL hard-coded below for ``page_number``, finds
    every ``<article class="product-card">`` element and reads the link,
    title and author out of each one.

    Args:
        page_number: Value substituted into the ``page`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests.get`` may block indefinitely.

    Returns:
        A list of dicts with the keys ``'page_url'`` (absolute book URL),
        ``'title'`` and ``'author'``. Cards missing any of the expected
        elements are skipped. If the page cannot be downloaded, an empty
        list is returned and the error is printed.
    """
    url = f'https://www.chitai-gorod.ru/catalog/books/hudozhestvennaya-literatura-110001?page={page_number}'
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best-effort crawl: one unreachable page must not abort the run.
        print(f"Error fetching page {page_number}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    data = []
    for book in soup.find_all('article', class_='product-card'):
        try:
            book_url = book.find('a', class_='product-card__picture')['href']
            title = book.find('div', class_='product-title__head').get_text(strip=True)
            author = book.find('div', class_='product-title__author').get_text(strip=True)
        except (AttributeError, KeyError, TypeError) as e:
            # find() returns None for a missing element, so subscripting it
            # raises TypeError and calling get_text() raises AttributeError;
            # a link without an href raises KeyError.
            print(f"Error processing book: {e}")
            continue
        absolute_url = f'https://www.chitai-gorod.ru{book_url}'
        data.append({'page_url': absolute_url, 'title': title, 'author': author})
    return data
# Crawl the listing pages and gather the rows in a plain list. Appending to
# the DataFrame with pd.concat on every page would copy the whole table each
# time, so the frame is extended only once after the loop.
# NOTE(review): the crawl starts at page 2, so the first listing page is
# never visited - confirm this is intended.
collected_rows = []
for page in range(2, 201):
    print(f"Processing page {page}...")
    collected_rows.extend(extract_data_from_page(page))
    # Stop early once enough books have been gathered.
    if len(df) + len(collected_rows) >= 5000:
        break

# Concatenating onto the existing `df` keeps its predeclared column layout;
# columns absent from the scraped rows are left empty for now.
df = pd.concat([df, pd.DataFrame(collected_rows)], ignore_index=True)
# The last page may overshoot the limit, so trim to exactly 5000 rows.
df = df.head(5000)
def extract_book_details(book_url, timeout=30):
    """Fetch a book page and return its cover image URL and annotation.

    Args:
        book_url: Absolute URL of the book page to download.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests.get`` may block indefinitely.

    Returns:
        A tuple ``(image_url, annotation)``. ``image_url`` is the ``content``
        of the page's ``og:image`` meta tag and ``annotation`` is the text of
        the ``<div itemprop="description">`` element; each is ``None`` when
        the corresponding element is absent. On any error (network failure,
        HTTP error status, parsing problem) the error is printed and
        ``(None, None)`` is returned.
    """
    try:
        response = requests.get(book_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        image_tag = soup.find('meta', {'name': 'og:image'})
        if image_tag is None:
            # Open Graph metadata is normally published through the
            # `property` attribute, so fall back to it when `name` is absent.
            image_tag = soup.find('meta', {'property': 'og:image'})
        # .get() keeps a tag without a `content` attribute from discarding
        # the annotation as well.
        image_url = image_tag.get('content') if image_tag is not None else None

        annotation_tag = soup.find('div', {'itemprop': 'description'})
        annotation = (
            annotation_tag.get_text(strip=True)
            if annotation_tag is not None
            else None
        )
        return image_url, annotation
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary of a long
        # crawl, and one bad page must not stop the remaining downloads.
        print(f"Error extracting details from {book_url}: {e}")
        return None, None
# Visit each collected book page (first 5000 rows at most) and store the
# cover image URL and annotation next to the listing data.
for row_label, page_url in df.head(5000)['page_url'].items():
    print(f"Fetching details for {page_url}...")
    cover_url, blurb = extract_book_details(page_url)
    df.at[row_label, 'image_url'] = cover_url
    df.at[row_label, 'annotation'] = blurb

# Persist the completed table; the positional index is not written out.
df.to_csv('books_data_with_details.csv', index=False)