Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- Dockerfile +32 -0
- app.py +159 -0
- requirements.txt +5 -0
Dockerfile
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM selenium/standalone-chrome:latest

# Package installation below requires root.
USER root

# Only pip and openssl are installed on top of the base image; the apt index
# is removed in the same layer so it does not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3-pip \
        openssl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the application so this layer is
# reused until requirements.txt changes. --no-cache-dir keeps pip's download
# cache out of the image.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# NOTE(review): the container keeps running as root. A non-root user
# ("useradd -m -u 1000 user" followed by "USER user") was sketched here but
# left disabled - confirm whether it can be enabled.

WORKDIR /app
COPY . /app

# Port that app.py binds uvicorn to.
EXPOSE 7860

ENTRYPOINT ["python3", "app.py"]
|
app.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from seleniumwire import webdriver
|
2 |
+
from selenium.webdriver.chrome.options import Options
|
3 |
+
from fastapi import FastAPI, Request
|
4 |
+
import uvicorn
|
5 |
+
import time
|
6 |
+
import json
|
7 |
+
from urllib.parse import unquote, urlparse, quote_plus
|
8 |
+
|
9 |
+
app = FastAPI()
|
10 |
+
|
11 |
+
def convert_cookies_to_dict(cookies):
    """Parse a ``Cookie`` header string into a ``{name: value}`` dict.

    Pairs are separated by ``;``, with or without a following space. Each pair
    is split on its first ``=`` only, so a value may itself contain ``=``.
    A pair without ``=`` is stored with an empty value. Empty segments and
    pairs whose name is empty are skipped, so an empty input yields ``{}``.
    When a name occurs more than once, the last occurrence wins.

    Args:
        cookies: The raw cookie string, e.g. ``"a=1; b=2"``.

    Returns:
        A dict mapping stripped cookie names to stripped cookie values.
    """
    parsed_cookies = {}
    for item in cookies.split(";"):
        name, _, value = item.partition("=")
        name = name.strip()
        if name:
            parsed_cookies[name] = value.strip()
    return parsed_cookies
|
16 |
+
|
17 |
+
def get_root_domain(url):
    """Return the last two dot-separated labels of the host in *url*.

    The host is taken from the parsed URL's ``hostname``, so any port or
    ``user:password@`` prefix is excluded and the result is lower-cased.
    A single trailing-dot form (``example.com.``) is normalised first.
    IP address literals are returned whole instead of being cut down to
    their last two components.

    This is a purely textual rule: it does not consult a public-suffix list,
    so ``www.example.co.uk`` yields ``co.uk``.

    Args:
        url: An absolute URL string.

    Returns:
        The derived domain, the bare host when it has a single label or is an
        IP address, or ``""`` when the URL has no host part.
    """
    import ipaddress  # function-local: only this helper needs it

    host = (urlparse(url).hostname or "").rstrip(".")

    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass  # not an IP literal; fall through to label handling
    else:
        return host

    labels = host.split(".")
    if len(labels) > 1:
        return ".".join(labels[-2:])
    return host
|
27 |
+
|
28 |
+
def try_json_decode(headers):
    """Try to JSON-decode a value; return it unchanged if decoding fails.

    The value is converted with ``str()`` before parsing. Note that any text
    that is valid JSON is decoded, so ``"123"`` becomes the integer ``123``
    and ``"true"`` becomes ``True``.

    Args:
        headers: The value to decode (typically a header value string).

    Returns:
        The decoded JSON value, or *headers* itself when its string form is
        not valid JSON.
    """
    try:
        return json.loads(str(headers))
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError; RecursionError guards
        # against pathologically nested input.
        return headers
|
34 |
+
|
35 |
+
@app.get("/")
def main():
    """Root endpoint: return a fixed success payload."""
    payload = {"code": 200, "msg": "Success"}
    return payload
|
38 |
+
|
39 |
+
@app.get("/chrome")
def chrome(url:str=None,wait:int=5,header:str=None,cookie:str=None,cookie_domain:str=None):
    """Load a URL in headless Chrome and report the final page state.

    The page is opened once so that cookies can be attached to its domain,
    browser-side state from that first visit is wiped, the caller's cookies
    and header overrides are applied, and the page is loaded a second time.
    After an optional delay the final URL, page source, cookies and a summary
    of the captured network traffic are returned.

    Args:
        url: Target URL (URL-decoded once more before use). Required.
        wait: Seconds to sleep after the second load, 0 to 30 inclusive.
        header: URL-encoded JSON object of request headers to override.
        cookie: URL-encoded ``Cookie`` header string; takes precedence over a
            ``cookie`` key supplied inside *header*.
        cookie_domain: Domain to set the cookies on. Defaults to
            ``"." + get_root_domain(url)``.

    Returns:
        ``{"code": 200, "data": {...}}`` on success, where ``data`` holds
        ``url``, ``page_source``, ``end_cookies``, ``is_jump`` and
        ``network``; or ``{"code": 500, "msg": ...}`` when an argument fails
        validation.
    """
    # A target URL is mandatory.
    if not isinstance(url, str):
        return {"code": 500,"msg":"No target URL"}
    target_url = unquote(url)
    target_domain = get_root_domain(target_url)

    # The wait time must be between 0 and 30 seconds.
    if wait not in range(0, 31):
        return {"code": 500,"msg":"The waiting time must be between 0 and 30"}
    wait_time = wait

    # Headers may be overridden, but must be supplied as JSON.
    header_array = {}
    if isinstance(header, str):
        try:
            header_array.update(json.loads(unquote(header)))
        except (TypeError, ValueError, RecursionError):
            return {"code": 500,"msg":"The header field is not JSON"}

    # An explicit cookie argument replaces any cookie given via the headers.
    if isinstance(cookie, str):
        header_array["cookie"] = unquote(cookie)

    # Headless browser with HAR capture enabled (makes driver.har available).
    options = Options()
    options.add_argument('--headless')
    seleniumwire_options = {'enable_har': True}
    driver = webdriver.Chrome(options=options,seleniumwire_options=seleniumwire_options)

    # Everything after construction runs under try/finally so the browser
    # process is always shut down, even when navigation or scripting fails.
    try:
        # A page must be open before driver.add_cookie can attach cookies.
        driver.get(target_url)

        # Discard cookies, sessionStorage and localStorage left by that first
        # visit, plus the requests captured while making it.
        driver.delete_all_cookies()
        driver.execute_script("window.sessionStorage.clear();")
        driver.execute_script("window.localStorage.clear();")
        del driver.requests

        # Cookies are installed in the browser rather than sent as a header.
        if 'cookie' in header_array:
            cookie_array = convert_cookies_to_dict(header_array.pop('cookie'))
            if isinstance(cookie_domain, str):
                domain = cookie_domain
            else:
                domain = f'.{target_domain}'

            for key, value in cookie_array.items():
                cookie_entry = {"name": key, "value": quote_plus(value), "domain": domain, "path": "/"}
                try:
                    driver.add_cookie(cookie_entry)
                except Exception:
                    # Best effort: a rejected cookie is reported and skipped
                    # so the remaining cookies are still applied.
                    print("Error Cookie:")
                    print(cookie_entry)

        # Rewrite the chosen request headers on every subsequent request;
        # headers that are not listed are left untouched. This uses a request
        # interceptor, selenium-wire's documented way to modify outgoing
        # requests, in place of the legacy ``header_overrides`` attribute.
        if header_array:
            def _apply_header_overrides(request):
                for name, value in header_array.items():
                    # Delete first so the header is replaced, not duplicated.
                    del request.headers[name]
                    request.headers[name] = str(value)

            driver.request_interceptor = _apply_header_overrides

        # Second visit, now with the caller's cookies and headers in place.
        driver.get(target_url)

        # Give in-page scripts (follow-up requests, delayed redirects) time
        # to finish before the page state is read.
        if wait_time > 0:
            time.sleep(wait_time)

        current_url = driver.current_url
        page_source = driver.page_source
        cookies = driver.get_cookies()

        # True when the browser ended up on a different URL than requested.
        is_jump = (target_url != current_url)

        # Summarise captured traffic; requests with no response are skipped.
        # (driver.har would give the complete HAR record instead.)
        network = []
        for request in driver.requests:
            if request.response:
                network.append({
                    "method":request.method,
                    "status":request.response.status_code,
                    "url":request.url,
                    "responseheaders":{k: try_json_decode(v) for k, v in request.response.headers.items()},
                    "requestheaders":{k: try_json_decode(v) for k, v in request.headers.items()},
                })
    finally:
        driver.quit()

    data = {
        "url": current_url,
        "page_source": page_source,
        "end_cookies": cookies,
        "is_jump": is_jump,
        "network": network
    }
    return {"code": 200,"data":data}
|
157 |
+
|
158 |
+
if __name__ == "__main__":
    # Serve the FastAPI application on every interface, port 7860.
    bind_host, bind_port = "0.0.0.0", 7860
    uvicorn.run("app:app", host=bind_host, port=bind_port)
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi
|
2 |
+
uvicorn
|
3 |
+
selenium
|
4 |
+
selenium-wire
|
5 |
+
blinker==1.7.0
|