Update main.py
main.py CHANGED
@@ -1,11 +1,7 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-
-from AWSClaude import AWSClaude
-import json
-import concurrent.futures
-import time
+
 
 app = FastAPI()
 app.add_middleware(
@@ -20,120 +16,7 @@ app.add_middleware(
 
 @app.post("/get_n_depth_results")
 async def get_n_depth_results(url,input_query):
-
-
-    def add_pdf_content(selected_pdf):
-        for pdf_url in selected_pdf:
-            print(pdf_url)
-            response = requests.get(pdf_url)
-
-            # Save the content of the response as a PDF file
-            pdf_path = "temp.pdf"
-            with open(pdf_path, "wb") as file:
-                file.write(response.content)
-
-            print(f"PDF file saved as {pdf_path}")
-
-            url = "http://localhost:5000/ask"
-            # url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v2"
-
-            data = {"processTables": "True"}
-
-            headers = {"Origin": "http://localhost:8080"}
-
-            with open(pdf_path, "rb") as file:
-                file_contents = file.read()
-
-            files = {
-                "pdf": (
-                    pdf_path,
-                    file_contents,
-                    "application/pdf",
-                )
-            }
-
-            response = requests.post(url, files=files, data=data, headers=headers)
-            all_content[pdf_url] = response.json()
-
-    def scrapper(input_url):
-        params = {'url': input_url}
-        headers = {'accept': 'application/json'}
-        url = 'https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data'
-        try:
-            response = requests.get(url, headers=headers, params=params)
-            all_url = response.json()["URL"]
-            all_content[input_url] = response.json()["Content"]
-            return all_url
-        except:
-            print(f"found a error url : {input_url}=========================================")
-            return "none"
-
-    pdf_urls = []
-
-    def separate_pdf_and_nonPDF_links(urls):
-        # Separate URLs into two lists
-        pdf_links = [url for url in urls if url and url.endswith('.pdf')]
-        if pdf_links:
-            pdf_urls.append(pdf_links)
-        return [url for url in urls if not (url and url.endswith('.pdf'))]  # other links for rescraping
-
-    def call_llm_service(scraped_data, input_url, input_query, pdf):
-        query = f"""
-        Here are my scraped links:
-
-        {scraped_data}
-
-        correct hostname: {input_url} use this host name for all other tasks
-
-        I need the always full (www.hotname.com/../) {pdf} URLs for the most relevant links related to "{input_query}". use the correct hostname from this provided content, give raw hyperlink with json format only don't give extra text details. only give json output
-        example json format is only links don't include keys (i need the always full (www.hotname.com/../))
-        """
-        llm = "ClaudeHaiku"
-        env = ""
-        user_id = "KAusXF7jp0Q40urdZWtDLXEhrmA"
-        thread_id = "hKxvoVgi7vRJCHhvMzH5"
-        stream_id = "stream1"
-        app_type = "sentinel"
-        other_request_params = {"messages": [
-            {"role": "user", "content": query},
-        ]}
-        return AWSClaude(llm, env, user_id, thread_id, stream_id, app_type, other_request_params).invoke()
-
-    input_url = f'["{url}"]'
-    input_query = input_query
-
-    for step in range(1, 3):
-        print(f"=================={step} step of scraping to get selected URLs from LLM=================================")
-        next_urls = []
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [executor.submit(scrapper, input_url) for input_url in (json.loads(input_url)[:2])]
-            for future in concurrent.futures.as_completed(futures):
-                next_urls.append(separate_pdf_and_nonPDF_links(future.result()))
-        selected_links_from_llm = call_llm_service(next_urls, input_url, input_query, "")
-        input_url = selected_links_from_llm
-        print(json.loads(input_url)[:2])
-
-    if not pdf_urls:
-        print(pdf_urls)
-        #return all_content.keys()
-        return all_content
-    else:
-        selected_pdf = json.loads(call_llm_service(pdf_urls, input_url, input_query, "only end with .pdf extension"))
-        print(pdf_urls)
-        print("selected pdf")
-        print(selected_pdf)
-        #return all_content.keys()
-        return all_content
-
-# # Start time
-# start_time = time.time()
-
-# print(main("https://www.keells.com/", "Please analyse reports"))
-
-# # End time
-# end_time = time.time()
-
-# # Calculate the time taken
-# time_taken = end_time - start_time
+    return "done"
 
-
+
+
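For reference, the removed body implemented an n-depth crawl-and-select loop: scrape a page's links through an external scraper service, ask an LLM (via the project's AWSClaude wrapper) to pick the most relevant links, re-scrape those, and set PDF links aside for a separate parsing step. The removed hunk also appears to rely on a `requests` import and an `all_content` dict that are not defined in the shown file. Below is a minimal, self-contained sketch of that loop under those assumptions; `pick_next_urls`, the `depth` parameter, and the simple "take the first two links" selection are illustrative stand-ins for the AWSClaude call, not code from this repo.

import json
import concurrent.futures

import requests

all_content = {}  # page content keyed by URL; the removed code shared this across helpers
pdf_urls = []     # batches of .pdf links collected while crawling


def scrapper(input_url):
    """Fetch the outgoing links and text content of one page via the external scraper service."""
    try:
        response = requests.get(
            "https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data",
            headers={"accept": "application/json"},
            params={"url": input_url},
        )
        payload = response.json()
        all_content[input_url] = payload["Content"]
        return payload["URL"]  # list of links found on the page
    except Exception:
        print(f"failed to scrape: {input_url}")
        return []


def separate_pdf_and_non_pdf_links(urls):
    """Set aside .pdf links for later and return the rest for the next crawl step."""
    pdf_links = [u for u in urls if u and u.endswith(".pdf")]
    if pdf_links:
        pdf_urls.append(pdf_links)
    return [u for u in urls if not (u and u.endswith(".pdf"))]


def pick_next_urls(link_batches, hostname, input_query):
    """Stand-in for the AWSClaude call: return a JSON array of links to follow next."""
    # The removed code built a prompt from the scraped links and asked
    # AWSClaude(...).invoke() to choose; here we simply take the first two links.
    flat = [u for batch in link_batches for u in batch]
    return json.dumps(flat[:2])


def get_n_depth_results(url, input_query, depth=2):
    """Crawl `depth` levels from `url`, collecting page content in all_content."""
    input_urls = [url]
    for step in range(1, depth + 1):
        print(f"step {step}: scraping {input_urls}")
        link_batches = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(scrapper, u) for u in input_urls[:2]]
            for future in concurrent.futures.as_completed(futures):
                link_batches.append(separate_pdf_and_non_pdf_links(future.result()))
        input_urls = json.loads(pick_next_urls(link_batches, url, input_query))
    return all_content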
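After this commit the route is a stub: `url` and `input_query` are untyped parameters with no defaults, so FastAPI exposes them as required query parameters, and the handler simply returns "done". A quick smoke test against a locally running instance might look like the following; the host and port are assumptions (e.g. `uvicorn main:app --port 8000`), not part of the Space's configuration shown here.

import requests

resp = requests.post(
    "http://localhost:8000/get_n_depth_results",
    params={"url": "https://www.keells.com/", "input_query": "Please analyse reports"},
)
print(resp.status_code, resp.json())  # expected: 200 'done'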