Spaces:
Paused
Paused
MuhFaridanSutariya
committed on
Commit
•
1400c09
1
Parent(s):
5623b18
first init
Browse files- app.py +111 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
from urllib.parse import quote_plus

import pandas as pd
import streamlit as st
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
|
7 |
+
|
8 |
+
# Instantiate global variables
|
9 |
+
df = pd.DataFrame(columns=["Title", "Location", "Company", "Link", "Description"])
|
10 |
+
|
11 |
+
# Get user input
|
12 |
+
inputJobTitle = st.text_input("Enter Job Title:")
|
13 |
+
inputJobLocation = st.text_input("Enter Job Location:")
|
14 |
+
totalPages = st.number_input("Enter Total Pages:", min_value=1, value=1)
|
15 |
+
submit_button = st.button("Submit")
|
16 |
+
|
17 |
+
def scrapeJobDescription(url):
    """Fetch a LinkedIn job posting page and return its description text.

    Parameters
    ----------
    url : str
        Link to an individual job posting.

    Returns
    -------
    str
        Stripped text of the job description, or "" when the description
        element is absent from the fetched page.
    """
    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always release the browser process — the original leaked one
        # headless Chrome per call because quit() was never invoked.
        driver.quit()
    soup = BeautifulSoup(html, "html.parser")
    # find() returns None when the element is missing; guard explicitly
    # instead of a bare except so unrelated errors are not swallowed.
    markup = soup.find("div", class_="show-more-less-html__markup")
    if markup is None:
        return ""
    return markup.text.strip()
|
32 |
+
|
33 |
+
def scrapeLinkedin():
    """Scrape LinkedIn job-search result pages into the global DataFrame.

    Reads the Streamlit inputs (``inputJobTitle``, ``inputJobLocation``,
    ``totalPages``) and appends one row per job card to the module-level
    ``df`` with columns Title, Location, Company, Link and Description.
    Scraping is best-effort: when the results list cannot be parsed
    (markup change, empty page) pagination stops instead of crashing.
    """
    global df
    counter = 0       # LinkedIn paginates 25 results at a time via ?start=
    pageCounter = 1

    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    try:
        while pageCounter <= totalPages:
            try:
                # URL-encode the user-supplied inputs so spaces and special
                # characters do not produce a malformed query string.
                driver.get(
                    "https://www.linkedin.com/jobs/search/"
                    f"?&keywords={quote_plus(inputJobTitle)}"
                    f"&location={quote_plus(inputJobLocation)}"
                    f"&refresh=true&start={counter}"
                )

                html = driver.page_source
                soup = BeautifulSoup(html, "html.parser")

                # Raises AttributeError (caught below) when the results
                # list is missing — treated as "no more pages".
                ulElement = soup.find("ul", class_="jobs-search__results-list")
                liElements = ulElement.find_all("li")

                for item in liElements:
                    jobTitle = item.find(
                        "h3", class_="base-search-card__title"
                    ).text.strip()
                    jobLocation = item.find(
                        "span", class_="job-search-card__location"
                    ).text.strip()
                    jobCompany = item.find(
                        "h4", class_="base-search-card__subtitle"
                    ).text.strip()
                    jobLink = item.find_all("a")[0]["href"]

                    jobDescription = scrapeJobDescription(jobLink)

                    if jobTitle and jobLocation and jobCompany and jobLink:
                        df = pd.concat(
                            [
                                df,
                                pd.DataFrame(
                                    {
                                        "Title": [jobTitle],
                                        "Location": [jobLocation],
                                        "Company": [jobCompany],
                                        "Link": [jobLink],
                                        "Description": [jobDescription],
                                    }
                                ),
                            ]
                        )

                counter += 25
                pageCounter += 1
            except Exception:
                # Narrowed from a bare except: stop paginating on parse
                # failure but never mask KeyboardInterrupt/SystemExit.
                break
    finally:
        # Release the browser even if an unexpected error escapes the loop.
        driver.quit()
|
92 |
+
|
93 |
+
def convert_df(df):
    """Serialize *df* to CSV (without the index) and return UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
|
95 |
+
|
96 |
+
|
97 |
+
# When the user presses Submit: run the scrape, show the resulting table,
# and offer it as a CSV download.
if submit_button:
    with st.spinner("Operation in progress. Please wait..."):
        scrapeLinkedin()
        time.sleep(1)
        st.write(df)

        csv_bytes = convert_df(df)
        st.download_button(
            "Press to Download",
            csv_bytes,
            "file.csv",
            "text/csv",
            key='download-csv',
        )
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
streamlit
|
3 |
+
selenium
|
4 |
+
bs4
|