Spaces:
Paused
Paused
MuhFaridanSutariya
committed on
Commit
•
1400c09
1
Parent(s):
5623b18
first init
Browse files- app.py +111 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
from urllib.parse import quote_plus

import pandas as pd
import streamlit as st
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
|
7 |
+
|
8 |
+
# Instantiate global variables
|
9 |
+
df = pd.DataFrame(columns=["Title", "Location", "Company", "Link", "Description"])
|
10 |
+
|
11 |
+
# Get user input
|
12 |
+
inputJobTitle = st.text_input("Enter Job Title:")
|
13 |
+
inputJobLocation = st.text_input("Enter Job Location:")
|
14 |
+
totalPages = st.number_input("Enter Total Pages:", min_value=1, value=1)
|
15 |
+
submit_button = st.button("Submit")
|
16 |
+
|
17 |
+
def scrapeJobDescription(url):
    """Fetch a LinkedIn job posting page and return its description text.

    Parameters
    ----------
    url : str
        Link to an individual job posting.

    Returns
    -------
    str
        Stripped text of the job description, or "" when the description
        element is absent from the fetched page.
    """
    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always release the browser process — the original leaked one
        # headless Chrome per call because quit() was never invoked.
        driver.quit()
    soup = BeautifulSoup(html, "html.parser")
    # find() returns None when the element is missing; guard explicitly
    # instead of a bare except so unrelated errors are not swallowed.
    markup = soup.find("div", class_="show-more-less-html__markup")
    if markup is None:
        return ""
    return markup.text.strip()
|
32 |
+
|
33 |
+
def scrapeLinkedin():
    """Scrape LinkedIn job-search result pages into the global DataFrame.

    Reads the Streamlit inputs (``inputJobTitle``, ``inputJobLocation``,
    ``totalPages``) and appends one row per job card to the module-level
    ``df`` with columns Title, Location, Company, Link and Description.
    Scraping is best-effort: when the results list cannot be parsed
    (markup change, empty page) pagination stops instead of crashing.
    """
    global df
    counter = 0       # LinkedIn paginates 25 results at a time via ?start=
    pageCounter = 1

    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    try:
        while pageCounter <= totalPages:
            try:
                # URL-encode the user-supplied inputs so spaces and special
                # characters do not produce a malformed query string.
                driver.get(
                    "https://www.linkedin.com/jobs/search/"
                    f"?&keywords={quote_plus(inputJobTitle)}"
                    f"&location={quote_plus(inputJobLocation)}"
                    f"&refresh=true&start={counter}"
                )

                html = driver.page_source
                soup = BeautifulSoup(html, "html.parser")

                # Raises AttributeError (caught below) when the results
                # list is missing — treated as "no more pages".
                ulElement = soup.find("ul", class_="jobs-search__results-list")
                liElements = ulElement.find_all("li")

                for item in liElements:
                    jobTitle = item.find(
                        "h3", class_="base-search-card__title"
                    ).text.strip()
                    jobLocation = item.find(
                        "span", class_="job-search-card__location"
                    ).text.strip()
                    jobCompany = item.find(
                        "h4", class_="base-search-card__subtitle"
                    ).text.strip()
                    jobLink = item.find_all("a")[0]["href"]

                    jobDescription = scrapeJobDescription(jobLink)

                    if jobTitle and jobLocation and jobCompany and jobLink:
                        df = pd.concat(
                            [
                                df,
                                pd.DataFrame(
                                    {
                                        "Title": [jobTitle],
                                        "Location": [jobLocation],
                                        "Company": [jobCompany],
                                        "Link": [jobLink],
                                        "Description": [jobDescription],
                                    }
                                ),
                            ]
                        )

                counter += 25
                pageCounter += 1
            except Exception:
                # Narrowed from a bare except: stop paginating on parse
                # failure but never mask KeyboardInterrupt/SystemExit.
                break
    finally:
        # Release the browser even if an unexpected error escapes the loop.
        driver.quit()
|
92 |
+
|
93 |
+
def convert_df(df):
    """Serialize *df* to CSV (without the index) and return UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
|
95 |
+
|
96 |
+
|
97 |
+
# When the user presses Submit: run the scrape, show the resulting table,
# and offer it as a CSV download.
if submit_button:
    with st.spinner("Operation in progress. Please wait..."):
        scrapeLinkedin()
        time.sleep(1)
        st.write(df)

        csv_bytes = convert_df(df)
        st.download_button(
            "Press to Download",
            csv_bytes,
            "file.csv",
            "text/csv",
            key='download-csv',
        )
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
streamlit
|
3 |
+
selenium
|
4 |
+
bs4
|