Arafath10 committed (verified)
Commit 3a02c2a · 1 Parent(s): 782b200

Update main.py

Files changed (1)
  1. main.py +4 -121
main.py CHANGED
@@ -1,11 +1,7 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-import requests
-from AWSClaude import AWSClaude
-import json
-import concurrent.futures
-import time
+
 
 app = FastAPI()
 app.add_middleware(
@@ -20,120 +16,7 @@ app.add_middleware(
 
 @app.post("/get_n_depth_results")
 async def get_n_depth_results(url,input_query):
-    all_content = {}
-
-    def add_pdf_content(selected_pdf):
-        for pdf_url in selected_pdf:
-            print(pdf_url)
-            response = requests.get(pdf_url)
-
-            # Save the content of the response as a PDF file
-            pdf_path = "temp.pdf"
-            with open(pdf_path, "wb") as file:
-                file.write(response.content)
-
-            print(f"PDF file saved as {pdf_path}")
-
-            url = "http://localhost:5000/ask"
-            # url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v2"
-
-            data = {"processTables": "True"}
-
-            headers = {"Origin": "http://localhost:8080"}
-
-            with open(pdf_path, "rb") as file:
-                file_contents = file.read()
-
-            files = {
-                "pdf": (
-                    pdf_path,
-                    file_contents,
-                    "application/pdf",
-                )
-            }
-
-            response = requests.post(url, files=files, data=data, headers=headers)
-            all_content[pdf_url] = response.json()
-
-    def scrapper(input_url):
-        params = {'url': input_url}
-        headers = {'accept': 'application/json'}
-        url = 'https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data'
-        try:
-            response = requests.get(url, headers=headers, params=params)
-            all_url = response.json()["URL"]
-            all_content[input_url] = response.json()["Content"]
-            return all_url
-        except:
-            print(f"found a error url : {input_url}=========================================")
-            return "none"
-
-    pdf_urls = []
-
-    def separate_pdf_and_nonPDF_links(urls):
-        # Separate URLs into two lists
-        pdf_links = [url for url in urls if url and url.endswith('.pdf')]
-        if pdf_links:
-            pdf_urls.append(pdf_links)
-        return [url for url in urls if not (url and url.endswith('.pdf'))]  # other links for rescraping
-
-    def call_llm_service(scraped_data, input_url, input_query, pdf):
-        query = f"""
-        Here are my scraped links:
-
-        {scraped_data}
-
-        correct hostname: {input_url} use this host name for all other tasks
-
-        I need the always full (www.hotname.com/../) {pdf} URLs for the most relevant links related to "{input_query}". use the correct hostname from this provided content, give raw hyperlink with json format only don't give extra text details. only give json output
-        example json format is only links don't include keys (i need the always full (www.hotname.com/../))
-        """
-        llm = "ClaudeHaiku"
-        env = ""
-        user_id = "KAusXF7jp0Q40urdZWtDLXEhrmA"
-        thread_id = "hKxvoVgi7vRJCHhvMzH5"
-        stream_id = "stream1"
-        app_type = "sentinel"
-        other_request_params = {"messages": [
-            {"role": "user", "content": query},
-        ]}
-        return AWSClaude(llm, env, user_id, thread_id, stream_id, app_type, other_request_params).invoke()
-
-    input_url = f'["{url}"]'
-    input_query = input_query
-
-    for step in range(1, 3):
-        print(f"=================={step} step of scraping to get selected URLs from LLM=================================")
-        next_urls = []
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [executor.submit(scrapper, input_url) for input_url in (json.loads(input_url)[:2])]
-            for future in concurrent.futures.as_completed(futures):
-                next_urls.append(separate_pdf_and_nonPDF_links(future.result()))
-        selected_links_from_llm = call_llm_service(next_urls, input_url, input_query, "")
-        input_url = selected_links_from_llm
-        print(json.loads(input_url)[:2])
-
-    if not pdf_urls:
-        print(pdf_urls)
-        #return all_content.keys()
-        return all_content
-    else:
-        selected_pdf = json.loads(call_llm_service(pdf_urls, input_url, input_query, "only end with .pdf extension"))
-        print(pdf_urls)
-        print("selected pdf")
-        print(selected_pdf)
-        #return all_content.keys()
-        return all_content
-
-# # Start time
-# start_time = time.time()
-
-# print(main("https://www.keells.com/", "Please analyse reports"))
-
-# # End time
-# end_time = time.time()
-
-# # Calculate the time taken
-# time_taken = end_time - start_time
+    return "done"
 
-# print(f"Time taken: {time_taken} seconds")
+
+