# Detik.com search-results scraper — Streamlit app.
# Scrapes news search results from detik.com, previews them in a table,
# and saves them to a CSV file.
import requests
from bs4 import BeautifulSoup
import csv
import re
from datetime import datetime
import streamlit as st
import pandas as pd
import time
def scrape_detik_search(query, num_pages):
    """Scrape Detik.com news search results for *query*.

    Fetches `num_pages` pages of https://www.detik.com/search/searchnews
    sorted by time, and extracts one row per article.

    Parameters:
        query (str): search term passed to Detik's search endpoint.
        num_pages (int): number of result pages to fetch (1-based).

    Returns:
        list[list[str]]: rows of [title, link, category, date, time];
        articles missing any expected tag or a parseable date are skipped.
    """
    base_url = "https://www.detik.com/search/searchnews"
    all_results = []
    # Patterns for Detik's date span, e.g. "12 Jan 2024, 08:30 WIB".
    date_pattern = re.compile(r'\d{1,2} [a-zA-Z]+ \d{4}')
    time_pattern = re.compile(r'\d{2}:\d{2}')
    for page in range(1, num_pages + 1):
        params = {
            "query": query,
            "sortby": "time",
            "page": page
        }
        # Timeout prevents one hung connection from stalling the app forever.
        response = requests.get(base_url, params=params, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for article in soup.find_all('article'):
                title_tag = article.find('h2', class_='title')
                date_tag = article.find('span', class_='date')
                link_tag = article.find('a')
                category_tag = article.find('span', class_='category')
                # Skip malformed result cards (e.g. ad slots) instead of
                # crashing with AttributeError on a missing tag.
                if not (title_tag and date_tag and link_tag and category_tag):
                    continue
                date_text = date_tag.text.strip()
                date_match = date_pattern.search(date_text)
                time_match = time_pattern.search(date_text)
                if date_match and time_match:
                    all_results.append([
                        title_tag.text.strip(),
                        link_tag['href'],
                        category_tag.text.strip(),
                        date_match.group(),
                        time_match.group(),
                    ])
            # Brief pause between pages to avoid hammering the site.
            with st.spinner('Mohon Ditunggu...'):
                time.sleep(1)
        else:
            st.write(f"Error {response.status_code}: Unable to retrieve data from Page {page}.")
    return all_results
def save_to_csv(results, filename):
    """Write scraped rows to *filename* as UTF-8 CSV with a fixed header.

    Parameters:
        results (list[list[str]]): rows of [title, link, category, date, time].
        filename (str): destination path; any existing file is overwritten.
    """
    header = ["Judul", "Link", "Kategori", "Tanggal", "Waktu"]
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(header)
        for row in results:
            csv_writer.writerow(row)
# --- Streamlit app entry point ---
st.title("Detik.com Search Results Scraper")

# Sidebar inputs: free-text query and page count clamped to 1-100.
search_query = st.sidebar.text_input("Enter the search query:")
num_pages_to_scrape = st.sidebar.number_input("Select the number of pages to scrape:", 1, 100, 1)

if st.sidebar.button("Scrape Results"):
    if not search_query.strip():
        # Guard: an empty query would scrape meaningless results and
        # produce a CSV named "_detik_search_results.csv".
        st.warning("Please enter a search query first.")
    else:
        search_results = scrape_detik_search(search_query, num_pages_to_scrape)
        with st.expander('Data Preview'):
            df = pd.DataFrame(search_results, columns=["Judul", "Link", "Kategori", "Tanggal", "Waktu"])
            st.dataframe(df)
        # Persist results to a CSV file named after the query.
        csv_filename = f"{search_query.replace(' ', '_')}_detik_search_results.csv"
        save_to_csv(search_results, csv_filename)
        st.success(f"Results saved to {csv_filename}")