Container committed on
Commit
76c921d
·
verified ·
1 Parent(s): a2e2a5b

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +32 -0
  2. app.py +159 -0
  3. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Selenium's standalone Chrome image; the tag is unpinned, so every rebuild
# tracks whatever upstream currently publishes as "latest".
FROM selenium/standalone-chrome:latest

# Root is required for the apt/pip installs below. No unprivileged user is
# switched to afterwards, so the application itself also runs as root.
USER root

# Only pip and openssl are installed; the apt package index is removed in the
# same layer so it does not end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3-pip \
        openssl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the application so this layer is
# reused while requirements.txt is unchanged.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir -r requirements.txt
# Print the resolved package versions into the build log.
RUN pip3 list

WORKDIR /app
COPY . /app

# Port the uvicorn server in app.py listens on.
EXPOSE 7860

ENTRYPOINT ["python3","app.py"]
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from seleniumwire import webdriver
2
+ from selenium.webdriver.chrome.options import Options
3
+ from fastapi import FastAPI, Request
4
+ import uvicorn
5
+ import time
6
+ import json
7
+ from urllib.parse import unquote, urlparse, quote_plus
8
+
9
+ app = FastAPI()
10
+
11
# Parse a cookie string into a dictionary.
def convert_cookies_to_dict(cookies):
    """Parse a ``Cookie`` header string into a ``{name: value}`` dict.

    Pairs are separated by ``;`` with or without surrounding whitespace, so
    both ``"a=1; b=2"`` and ``"a=1;b=2"`` are accepted. Only the first ``=``
    in a pair separates the name from the value; a pair without ``=`` maps
    its name to an empty string. Empty segments (an empty input string or a
    trailing separator) are skipped. When a name repeats, the last occurrence
    wins.

    Args:
        cookies: The raw cookie string, e.g. ``"sid=abc; theme=dark"``.

    Returns:
        A dict mapping stripped cookie names to stripped cookie values.
    """
    parsed_cookies = {}
    for item in cookies.split(";"):
        item = item.strip()
        if not item:
            # Nothing between two separators (or an empty input string).
            continue
        name, _, value = item.partition("=")
        parsed_cookies[name.strip()] = value.strip()
    return parsed_cookies
16
+
17
# Get the root domain of a URL string.
def get_root_domain(url):
    """Return the last two labels of the host name found in *url*.

    The host is taken from ``urlparse(url).hostname``, so any port and
    ``user:password@`` prefix are excluded and the result is lower-cased.
    IP address literals are returned unchanged because they have no root
    domain. A URL without a network location yields an empty string.

    NOTE: this is a simple two-label heuristic; multi-part public suffixes
    such as ``co.uk`` are not recognised (``a.example.co.uk`` -> ``co.uk``).

    Args:
        url: An absolute URL such as ``"https://www.example.com:8080/x"``.

    Returns:
        The root domain (``"example.com"``), the bare host when it has a
        single label or is an IP address, or ``""`` when no host is present.
    """
    # Local import: the module's import block is left untouched.
    import ipaddress

    # A trailing dot (fully-qualified form) would otherwise become a label.
    host = (urlparse(url).hostname or "").rstrip(".")

    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass  # Not an IP literal: fall through to the label logic.
    else:
        return host

    parts = host.split('.')
    if len(parts) > 1:
        return '.'.join(parts[-2:])
    return host
27
+
28
# Try to JSON-decode a value; return the original value if decoding fails.
def try_json_decode(headers):
    """Decode *headers* as JSON when possible, otherwise return it unchanged.

    The argument is converted with ``str()`` before decoding, so a value such
    as ``"123"`` comes back as the integer ``123`` and ``'{"a": 1}'`` as a
    dict, while ordinary header text (``"text/html"``) is returned as-is.

    Args:
        headers: A single value (typically one header value) to decode.

    Returns:
        The decoded JSON value, or the original argument when its string form
        is not valid JSON.
    """
    try:
        return json.loads(str(headers))
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input. Either way, keep the raw value.
        return headers
34
+
35
@app.get("/")
def main():
    """Root endpoint: return a fixed success payload."""
    payload = {"code": 200, "msg": "Success"}
    return payload
38
+
39
@app.get("/chrome")
def chrome(url: str = None, wait: int = 5, header: str = None, cookie: str = None, cookie_domain: str = None):
    """Load *url* in headless Chrome and report what the browser saw.

    The page is opened once so that cookies can be attached to its domain,
    local state and captured traffic from that first visit are discarded, the
    requested cookies and header overrides are applied, and the page is then
    loaded a second time. After an optional fixed wait the final URL, page
    source, cookies and a summary of the captured network traffic are
    returned.

    Args:
        url: Target URL (URL-decoded once more before use). Required.
        wait: Seconds to sleep after the second load, 0 to 30 inclusive.
        header: URL-encoded JSON object of request headers to override.
            A ``"cookie"`` key is applied as browser cookies rather than as
            a header. A ``null`` value removes the header.
        cookie: URL-encoded cookie string; overrides any ``"cookie"`` key
            supplied through *header*.
        cookie_domain: Domain to set the cookies on. Defaults to
            ``"."`` + the root domain of *url*.

    Returns:
        ``{"code": 200, "data": {...}}`` on success, or
        ``{"code": 500, "msg": ...}`` when an argument is rejected.
    """
    # Enable HAR capture so that driver.har can be queried.
    seleniumwire_options = {
        'enable_har': True
    }

    # A target URL is mandatory.
    if not isinstance(url, str):
        return {"code": 500,"msg":"No target URL"}
    target_url = unquote(url)
    target_domain = get_root_domain(target_url)

    # The waiting time must be between 0 and 30 seconds.
    if not 0 <= wait <= 30:
        return {"code": 500,"msg":"The waiting time must be between 0 and 30"}
    wait_time = wait

    header_array = {}

    # Headers may be overridden, but they must be passed as a JSON object.
    if isinstance(header, str):
        try:
            decoded_header = json.loads(unquote(header))
        except ValueError:
            return {"code": 500,"msg":"The header field is not JSON"}
        if not isinstance(decoded_header, dict):
            return {"code": 500,"msg":"The header field is not JSON"}
        header_array.update(decoded_header)

    # An explicit cookie argument takes precedence over a "cookie" header.
    if isinstance(cookie, str):
        header_array["cookie"] = unquote(cookie)

    # Set up a headless browser.
    options = Options()
    options.add_argument('--headless')

    driver = webdriver.Chrome(options=options, seleniumwire_options=seleniumwire_options)

    # From here on the browser process exists; always shut it down, even when
    # navigation or scraping raises.
    try:
        # The page must be open before driver.add_cookie can attach cookies.
        driver.get(target_url)

        # Drop cookies, sessionStorage and localStorage left by this first
        # visit, and discard the requests it generated.
        driver.delete_all_cookies()
        driver.execute_script("window.sessionStorage.clear();")
        driver.execute_script("window.localStorage.clear();")
        del driver.requests

        # Add the caller-supplied cookies to the browser.
        if 'cookie' in header_array:
            cookie_array = convert_cookies_to_dict(header_array.pop('cookie'))

            if isinstance(cookie_domain, str):
                domain = cookie_domain
            else:
                domain = f'.{target_domain}'

            for key, value in cookie_array.items():
                cookie_entry = {"name": key, "value": quote_plus(value), "domain": domain, "path": "/"}
                try:
                    driver.add_cookie(cookie_entry)
                except Exception:
                    # Best effort: a cookie the browser rejects is reported
                    # and skipped so the remaining ones are still applied.
                    print("Error Cookie:")
                    print(cookie_entry)

        # Rewrite the request headers of the next visit; headers that are not
        # listed keep their original value. selenium-wire applies overrides
        # through a request interceptor, and an existing header has to be
        # deleted before it is set again or it would be duplicated.
        if header_array:
            def apply_header_overrides(request):
                for name, value in header_array.items():
                    del request.headers[name]
                    if value is not None:
                        request.headers[name] = str(value)

            driver.request_interceptor = apply_header_overrides

        # Visit the URL again, now with the cookies and headers in place.
        driver.get(target_url)

        # Sleep to give the page time to finish loading (scripts may issue
        # follow-up requests or perform delayed redirects).
        if wait_time > 0:
            time.sleep(wait_time)

        # State of the page once the wait is over.
        current_url = driver.current_url
        page_source = driver.page_source
        cookies = driver.get_cookies()

        # True when the final URL differs from the requested one
        # (for example after a 301/302 redirect).
        is_jump = (target_url != current_url)

        # Summary of the captured traffic (not the full HAR, which is
        # available through driver.har); requests without a response are
        # left out.
        network = [
            {
                "method": request.method,
                "status": request.response.status_code,
                "url": request.url,
                "responseheaders": {k: try_json_decode(v) for k, v in request.response.headers.items()},
                "requestheaders": {k: try_json_decode(v) for k, v in request.headers.items()},
            }
            for request in driver.requests
            if request.response
        ]

        data = {
            "url": current_url,
            "page_source": page_source,
            "end_cookies": cookies,
            "is_jump": is_jump,
            "network": network
        }
    finally:
        driver.quit()

    return {"code": 200,"data":data}
157
+
158
if __name__ == '__main__':
    # Serve the FastAPI application on all interfaces, port 7860, when the
    # module is executed as a script.
    server_settings = {"app": 'app:app', "host": "0.0.0.0", "port": 7860}
    uvicorn.run(**server_settings)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ selenium
4
+ selenium-wire
5
+ blinker==1.7.0