Upload 19 files
- Daily_Star_fully_scraped.py +79 -0
- Dhaka_Tribune_Fully_Scraped.py +82 -0
- LLM_automation_GPT.py +125 -0
- LLM_automation_GPT35.py +91 -0
- LLM_automation_Groq.py +130 -0
- Prothom_alo_fully_scraped.py +80 -0
- animate.json +0 -0
- app.py +85 -0
- buet.png +0 -0
- langchain.png +0 -0
- llama3.jpeg +0 -0
- numpy.png +0 -0
- openai.png +0 -0
- packages.txt +1 -0
- pandas.png +0 -0
- requirements.txt +14 -0
- selenium.png +0 -0
- selenium_webdriver.jpeg +0 -0
- streamlit.png +0 -0
Daily_Star_fully_scraped.py
ADDED
@@ -0,0 +1,79 @@
def get_data(number):
    ## Necessary imports
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    from selenium.webdriver.common.by import By  ## For finding elements by XPath
    import time

    options = ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    driver.get("https://www.thedailystar.net/tags/road-accident")

    """
    To browse with a visible browser window instead of headless mode:
    from selenium import webdriver
    driver = webdriver.Chrome()
    driver.get("https://www.thedailystar.net/tags/road-accident")
    """

    #### Scraping news title, news link and publish date ####
    news_list = []
    news_link = []
    publish_date = []
    for i in range(number):
        # After every 10th item, scroll down and click "Load more" to fetch the next batch.
        if (i + 1) % 10 == 0:
            last_height = driver.execute_script("return document.body.scrollHeight")
            driver.execute_script(f"window.scrollTo(0, {last_height - 950})")
            driver.find_element('xpath', '/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[2]/ul/li/a').click()
            time.sleep(10)
        txt = driver.find_element('xpath', f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/div[2]/div[2]/h3/a')
        publish_date.append(driver.find_element('xpath', f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/div[1]').text)
        news_list.append(txt.text)
        news_link.append(txt.get_attribute("href"))

    #### Converting the lists to a pandas dataframe via a dictionary ####
    import pandas as pd
    data = {'News Title': news_list, 'News Link': news_link, 'Publish Date': publish_date}
    df = pd.DataFrame(data)

    ############################################### Description Extraction #################################################
    from newspaper import Article

    text = []
    for i in range(len(df)):
        url = df['News Link'][i]
        article = Article(url)
        article.download()
        article.parse()
        text.append(article.text)

    df2 = df.assign(Description=text)

    # Drop the rows whose publish date could not be scraped.
    for p in range(len(df2)):
        if df2['Publish Date'][p] == "Not available":
            df2.drop([p], inplace=True)

    df2.reset_index(drop=True, inplace=True)

    df2["Date + Desc"] = df2['Publish Date'] + ". News Description:" + df2['Description']

    return df2
    #df2.to_csv('Daily_Star_Description.txt', index=False)
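The scraper module can also be exercised on its own, outside the Streamlit app. A minimal sketch (the output filename is illustrative, not part of the repo):

import Daily_Star_fully_scraped

# Scrape the first 20 headlines with their links, dates and article text.
df = Daily_Star_fully_scraped.get_data(20)
df.to_csv("daily_star_accidents.csv", index=False)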
Dhaka_Tribune_Fully_Scraped.py
ADDED
@@ -0,0 +1,82 @@
def get_data(number):
    ## Necessary imports
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    from selenium.webdriver.common.by import By  ## For finding elements by XPath
    import time

    options = ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    driver.get("https://www.dhakatribune.com/topic/road-accident")

    #### Scraping News Title and News Link ####
    news_list = []
    news_link = []
    for i in range(number):
        # The results grid shows four news cards per row: map the running
        # index i to its row and to its position within that row.
        row_counter = i // 4 + 1
        news_counter = i % 4 + 1
        # After every 20th item, scroll to the bottom and click "Load more".
        if (i + 1) % 20 == 0:
            last_height = driver.execute_script("return document.body.scrollHeight")
            driver.execute_script(f"window.scrollTo(0, {last_height})")
            driver.find_element('xpath', '/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button').click()
            time.sleep(10)
        txt = driver.find_element('xpath', f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[1]/div[{row_counter}]/div[{news_counter}]/div/div[2]/div/div/div/h2/a')
        news_list.append(txt.text)
        news_link.append(txt.get_attribute("href"))

    ###### Scraping Publish Date ######
    publish_date = []
    for i in range(len(news_link)):
        driver.get(news_link[i])
        time.sleep(6)
        driver.execute_script("window.stop();")
        try:
            publish_date.append(driver.find_element('xpath', '/html/body/div[3]/div/div[2]/div/div/div[2]/div/div[1]/div/div[1]/div/div/div/div/div/div/div[2]/div/div/div[3]/div/div[1]/div/div[2]/span[1]').text)
        except:
            publish_date.append("Not available")

    #### Converting the lists to a pandas dataframe via a dictionary ####
    import pandas as pd
    data = {'News Title': news_list, 'News Link': news_link, 'Publish Date': publish_date}
    df = pd.DataFrame(data)

    ############################################ Description Extraction ###################################################
    from newspaper import Article

    text = []
    for i in range(len(df)):
        url = df['News Link'][i]
        article = Article(url)
        article.download()
        article.parse()
        text.append(article.text)

    df2 = df.assign(Description=text)

    # Drop the rows whose publish date could not be scraped.
    for p in range(len(df2)):
        if df2['Publish Date'][p] == "Not available":
            df2.drop([p], inplace=True)

    df2.reset_index(drop=True, inplace=True)
    df2["Date + Desc"] = df2['Publish Date'] + ". News Description:" + df2['Description']

    return df2
    #df2.to_csv('Dhaka_Tribune_Description.txt', index=False)
LLM_automation_GPT.py
ADDED
@@ -0,0 +1,125 @@
def create_data(description):
    print("Running LLM_automation_GPT")
    from langchain_core.prompts import ChatPromptTemplate      ### ChatPromptTemplate is used to build the chat prompt
    from langchain_openai import ChatOpenAI                    ### For the OpenAI chat models
    from langchain_core.output_parsers import StrOutputParser  ### Default output parser; a custom parser can also be created

    import os
    from dotenv import load_dotenv
    import pandas as pd

    load_dotenv()

    ### The OpenAI API key is loaded from the .env file by load_dotenv();
    ### it must be present in the environment as OPENAI_API_KEY.

    ### Create the prompt template (a list keeps the system/user message order):
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a helpful assistant, please respond to the queries"),  ### The prompt needs both a system and a user message
            ("user", "question: {question}")
        ]
    )

    #### Create the OpenAI llm:
    llm = ChatOpenAI(model="gpt-4o")

    ### Create an output parser:
    output_parser = StrOutputParser()

    #### Creating the chain: the output of the step before each | symbol is passed as input to the step after it.
    #### Here the chain has three steps: the prompt, the llm and the output parser.
    chain = prompt | llm | output_parser

    df = description
    df = df.fillna(0)
    dj = []

    for i in range(len(df)):
        dj.append(chain.invoke({"question": df['Date + Desc'][i] + " Is the news referring to one or many specific accident incidents or accident in general? Make sure that your answer is only in one word. If a report contains more than one accident incident, classify it as a general accident incident. The word should be either 'Specific' or 'General'. Your answer should not contain any words except 'Specific' and 'General' "}))

    df2 = df.copy()
    df2['Report Type'] = dj

    def drp(p):
        df2.drop([p], inplace=True)

    ### Removing the general accident reports:
    for p in range(len(df)):
        if "General" in df2['Report Type'][p]:
            drp(p)

    ### Resetting the index:
    df2.reset_index(drop=True, inplace=True)

    ### Now extracting the column values with the llm.
    ### A helper that invokes the llm; for some reason phi3 sometimes gives inaccurate results if invoked directly inside dj2.append():
    def res(i):
        response = chain.invoke({"question": df2['Description'][i] + f"""Provide only the answers of the following questions, separated by commas only:
        If the news was published on {df2['Publish Date'][i]}, what is the date of accident occurrence? The date must be in Day-Month-Year format. Be careful because the publish date and the accident occurrence date may or may not be the same. Try to deduce the correct accident date and do not include Saturday, Sunday etc. in your date. Only numerics are needed,
        Time of accident occurrence, How many people were killed in the accident in numeric number?,
        How many people were injured in the accident in numeric number?,
        Location of the accident,
        Type of road where the accident occurred,
        Was there any pedestrian involved?,
        Do not include any other sentences except the answers separated by commas only,
        if you cannot find or deduce an answer simply put 'Not Available' in place of it.
        If a report mentions more than one specific accident incident, only consider the 1st accident incident and ignore the second one"""})
        return response

    #### The dj2 list holds all column values separated by commas:
    dj2 = []
    for i in range(len(df2)):
        dj2.append(res(i))

    ### Finding the vehicles involved:
    def res2(i):
        response = chain.invoke({"question": df2['Date + Desc'][i] + " Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens(-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name, do not include any extra sentences"})
        return response

    #### The vehicles list holds all vehicles involved:
    vehicles = []
    for i in range(len(df2)):
        vehicles.append(res2(i))

    ### Splitting each dj2 string at the commas:
    Date = []
    Time = []
    Killed = []
    Injured = []
    Location = []
    Road_Characteristic = []
    Pedestrian_Involved = []

    for i in range(len(dj2)):
        words = dj2[i].split(",")  # Splitting at the comma delimiter
        Date.append(words[0])
        Time.append(words[1])
        Killed.append(words[2])
        Injured.append(words[3])
        Location.append(words[4])
        Road_Characteristic.append(words[5])
        Pedestrian_Involved.append(words[6])

    #### Assembling the final dataframe:
    df2["Accident Date"] = Date
    df2["Time"] = Time
    df2["Killed"] = Killed
    df2["Injured"] = Injured
    df2["Location"] = Location
    df2["Road_Characteristic"] = Road_Characteristic
    df2["Pedestrian_Involved"] = Pedestrian_Involved
    df2["Vehicles Involved"] = vehicles
    df3 = df2.drop(columns=['Description', 'Date + Desc', 'Report Type'])
    return df3
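The final split assumes the model returned exactly seven comma-separated fields; a shorter reply would make words[6] raise an IndexError, and a longer one would misalign the columns. A defensive variant is sketched below (split_fields is a hypothetical helper, not part of this repo):

# Pad or truncate the model's comma-separated reply so that indexing
# fields 0..6 can never fail, and strip stray whitespace.
def split_fields(reply, n=7, filler="Not Available"):
    words = [w.strip() for w in reply.split(",")]
    return words[:n] + [filler] * max(0, n - len(words))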
LLM_automation_GPT35.py
ADDED
@@ -0,0 +1,91 @@
def create_data(description):
    from langchain_core.prompts import ChatPromptTemplate      ### ChatPromptTemplate is used to build the chat prompt
    from langchain_openai import ChatOpenAI                    ### For the OpenAI chat models
    from langchain_core.output_parsers import StrOutputParser  ### Default output parser; a custom parser can also be created

    import os
    from dotenv import load_dotenv

    load_dotenv()

    ### The OpenAI API key is loaded from the .env file by load_dotenv();
    ### it must be present in the environment as OPENAI_API_KEY.

    ### Create the prompt template (a list keeps the system/user message order):
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a helpful assistant, please respond to the queries"),  ### The prompt needs both a system and a user message
            ("user", "question: {question}")
        ]
    )
    df2 = description

    #### Create the OpenAI llm:
    llm = ChatOpenAI(model="gpt-3.5-turbo")

    ### Create an output parser:
    output_parser = StrOutputParser()

    #### Creating the chain: the output of the step before each | symbol is passed as input to the step after it.
    #### Here the chain has three steps: the prompt, the llm and the output parser.
    chain = prompt | llm | output_parser

    ### A helper that invokes the llm; for some reason phi3 sometimes gives inaccurate results if invoked directly inside dj.append():
    def res(i):
        response = chain.invoke({"question": df2['Description'][i] + " Is the news referring to a specific accident incident or accident in general? Answer only in a word: 'specific' or 'general'. No other words are allowed in your answer"})
        return response

    #### The dj list holds the type of each report, 'General' or 'Specific':
    dj = []
    for i in range(len(df2)):
        dj.append(res(i))

    df2['Report Type'] = dj

    def drp(p):
        df2.drop([p], inplace=True)

    ### Removing the general accident reports:
    for p in range(len(df2)):
        if "General" in df2['Report Type'][p] or "general" in df2['Report Type'][p]:
            drp(p)

    ### Resetting the index:
    df2.reset_index(drop=True, inplace=True)

    ### Asking the llm one question per column:
    Date = []
    Time = []
    Killed = []
    Injured = []
    Location = []
    Road_Characteristic = []
    Pedestrian_Involved = []
    vehicles = []

    for i in range(len(df2)):
        Date.append(chain.invoke({"question": "Read the accident report carefully and provide only the answer of the question asked. Do not add any extra sentences or words except the answer: What is the date of accident occurrence in Day-Month-Year format? Keep in mind that the news publish date and the accident occurrence date may be different. If you cannot find or deduce the answer, simply reply Not Available" + df2['Description'][i]}))
        Time.append(chain.invoke({"question": "Read the accident report carefully and provide only the answer of the question asked. Do not add any extra sentences or words except the answer: What is the time of accident occurrence in 24-hour format? If you cannot find or deduce the answer, simply reply Not Available" + df2['Description'][i]}))
        Killed.append(chain.invoke({"question": "Read the accident report carefully and provide only the answer of the question asked. Do not add any extra sentences or words except the answer: How many people were killed in the accident? If you cannot find or deduce the answer, simply reply Not Available" + df2['Description'][i]}))
        Injured.append(chain.invoke({"question": "Read the accident report carefully and provide only the answer of the question asked. Do not add any extra sentences or words except the answer: How many people were injured in the accident? If you cannot find or deduce the answer, simply reply Not Available" + df2['Description'][i]}))
        Location.append(chain.invoke({"question": "Read the accident report carefully and provide only the answer of the question asked. Do not add any extra sentences or words except the answer: What is the name of the location where the accident took place? If you cannot find or deduce the answer, simply reply Not Available" + df2['Description'][i]}))
        Road_Characteristic.append(chain.invoke({"question": "Read the accident report carefully and provide only the answer of the question asked. Do not add any extra sentences or words except the answer: What is the type of road where the accident took place? If you cannot find or deduce the answer, simply reply Not Available" + df2['Description'][i]}))
        Pedestrian_Involved.append(chain.invoke({"question": "Read the accident report carefully and provide only the answer of the question asked. Do not add any extra sentences or words except the answer: Was there any pedestrian involved in the accident? If you cannot find or deduce the answer, simply reply Not Available" + df2['Description'][i]}))
        vehicles.append(chain.invoke({"question": "Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens(-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name, do not include any extra sentences" + df2['Description'][i]}))

    #### Assembling the final dataframe:
    df2["Date"] = Date
    df2["Time"] = Time
    df2["Killed"] = Killed
    df2["Injured"] = Injured
    df2["Location"] = Location
    df2["Road_Characteristic"] = Road_Characteristic
    df2["Pedestrian_Involved"] = Pedestrian_Involved
    df2["Vehicles Involved"] = vehicles
    df3 = df2.drop(columns=['Description', 'Report Type', 'Date + Desc'])
    return df3
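Because this module asks one question per column, each news report costs roughly eight separate chat completions, which is why it runs slower than the single combined prompt used in LLM_automation_GPT.py. The prompt | llm | output_parser chain can also be tried in isolation; a minimal sketch (assumes OPENAI_API_KEY is set; the question text is illustrative):

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant, please respond to the queries"),
    ("user", "question: {question}"),
])
chain = prompt | ChatOpenAI(model="gpt-3.5-turbo") | StrOutputParser()
# Each .invoke() call is one chat completion.
print(chain.invoke({"question": "How many people were injured? Three people were hurt."}))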
LLM_automation_Groq.py
ADDED
@@ -0,0 +1,130 @@
def create_data(description):
    from langchain_core.prompts import ChatPromptTemplate      ### ChatPromptTemplate is used to build the chat prompt
    from langchain_core.output_parsers import StrOutputParser  ### Default output parser; a custom parser can also be created
    from langchain_groq import ChatGroq

    import os
    from dotenv import load_dotenv
    import pandas as pd

    load_dotenv()

    ### Set the API keys:
    #os.environ["LANGCHAIN_TRACING_V2"]="true"  ### Would let LangSmith trace the calls to and from the llm (both free and paid)
    ### The Groq API key is loaded from the .env file by load_dotenv();
    ### it must be present in the environment as GROQ_API_KEY.

    ### Create the prompt template (a list keeps the system/user message order):
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a helpful assistant, please respond to the queries"),  ### The prompt needs both a system and a user message
            ("user", "question: {question}")
        ]
    )

    #### Create the Llama 3 70B llm. Groq serves the model through its hosted API,
    #### so no local installation is needed:
    llm = ChatGroq(
        model="llama3-70b-8192"
    )

    ### Create an output parser:
    output_parser = StrOutputParser()

    #### Creating the chain: the output of the step before each | symbol is passed as input to the step after it.
    #### Here the chain has three steps: the prompt, the llm and the output parser.
    chain = prompt | llm | output_parser

    df = description
    df = df.fillna(0)
    dj = []

    for i in range(len(df)):
        dj.append(chain.invoke({"question": df['Date + Desc'][i] + " Is the news referring to one or many specific accident incidents or accident in general? Make sure that your answer is only in one word. If a report contains more than one accident incident, classify it as a general accident incident. The word should be either 'Specific' or 'General'. Your answer should not contain any words except 'Specific' and 'General' "}))

    df2 = df.copy()
    df2['Report Type'] = dj

    def drp(p):
        df2.drop([p], inplace=True)

    ### Removing the general accident reports:
    for p in range(len(df)):
        if "General" in df2['Report Type'][p]:
            drp(p)

    ### Resetting the index:
    df2.reset_index(drop=True, inplace=True)

    ### Now extracting the column values with the llm.
    ### A helper that invokes the llm; for some reason phi3 sometimes gives inaccurate results if invoked directly inside dj2.append():
    def res(i):
        response = chain.invoke({"question": df2['Description'][i] + f"""Provide only the answers of the following questions, separated by commas only:
        If the news was published on {df2['Publish Date'][i]}, what is the date of accident occurrence? The date must be in Day-Month-Year format. Be careful because the publish date and the accident occurrence date may or may not be the same. Try to deduce the correct accident date,
        Time of accident occurrence, How many people were killed in the accident in numeric number?,
        How many people were injured in the accident in numeric number?,
        Location of the accident,
        Type of road where the accident occurred,
        Was there any pedestrian involved?,
        Do not include any other sentences except the answers separated by commas only, and do not include sentences such as: Here are the answers,
        if you cannot find or deduce an answer simply put 'Not Available' in place of it.
        If a report mentions more than one specific accident incident, only consider the 1st accident incident and ignore the second one"""})
        return response

    #### The dj2 list holds all column values separated by commas:
    dj2 = []
    for i in range(len(df2)):
        dj2.append(res(i))

    ### A helper for the vehicles involved:
    def res2(i):
        response = chain.invoke({"question": df2['Date + Desc'][i] + " Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens(-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name, do not include any extra sentences"})
        return response

    #### The vehicles list holds all vehicles involved:
    vehicles = []
    for i in range(len(df2)):
        vehicles.append(res2(i))

    ### Splitting each dj2 string at the commas:
    Date = []
    Time = []
    Killed = []
    Injured = []
    Location = []
    Road_Characteristic = []
    Pedestrian_Involved = []

    for i in range(len(dj2)):
        words = dj2[i].split(",")  # Splitting at the comma delimiter
        Date.append(words[0])
        Time.append(words[1])
        Killed.append(words[2])
        Injured.append(words[3])
        Location.append(words[4])
        Road_Characteristic.append(words[5])
        Pedestrian_Involved.append(words[6])

    #### Assembling the final dataframe:
    df2["Accident Date"] = Date
    df2["Time"] = Time
    df2["Killed"] = Killed
    df2["Injured"] = Injured
    df2["Location"] = Location
    df2["Road_Characteristic"] = Road_Characteristic
    df2["Pedestrian_Involved"] = Pedestrian_Involved
    df2["Vehicles_involved"] = vehicles
    df3 = df2.drop(columns=['Description', 'Date + Desc', 'Report Type'])
    return df3
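For one-word classification answers like 'Specific'/'General', deterministic decoding helps; ChatGroq accepts a temperature argument for this. A sketch of the setting (an assumption about a useful configuration, not something this repo sets):

from langchain_groq import ChatGroq

# temperature=0 makes the short classification answers more repeatable.
llm = ChatGroq(model="llama3-70b-8192", temperature=0)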
Prothom_alo_fully_scraped.py
ADDED
@@ -0,0 +1,80 @@
def get_data(number):
    print("Running Prothom_alo_fully_scraped")
    ## Necessary imports
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    from selenium.webdriver.common.by import By  ## For finding elements by XPath
    import time

    options = ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka")

    #### Scraping News Title and News Link ####
    news_list = []
    news_link = []
    l = 0  # Flag: set to 1 when the first attempt to click "Load more" fails
    for i in range(number):
        if i < 15:
            # The first 15 results are already on the page.
            txt = driver.find_element('xpath', f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
            news_list.append(txt.text)
            news_link.append(txt.get_attribute("href"))
        else:
            if (i - 15) % 10 == 0:
                # After every 10 further items, scroll down and click "Load more".
                time.sleep(5)
                last_height = driver.execute_script("return document.body.scrollHeight")
                driver.execute_script(f"window.scrollTo(0, {last_height - 1200})")
                try:
                    driver.find_element('xpath', f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/span').click()
                except:
                    l = 1
                if l == 1:
                    # Retry with the enclosing element if clicking the span failed.
                    time.sleep(5)
                    try:
                        driver.find_element('xpath', f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
                    except:
                        time.sleep(5)
                        driver.find_element('xpath', f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
                    l = 0
                time.sleep(5)
            txt = driver.find_element('xpath', f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
            news_list.append(txt.text)
            news_link.append(txt.get_attribute("href"))

    ###### Scraping Publish Date and Description ######
    publish_date = []
    text = []
    for i in range(len(news_link)):
        driver.get(news_link[i])
        try:
            publish_date.append(driver.find_element('xpath', '/html/body/div/div[6]/div/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/time/span').text)
            tmp = ""
            elements = driver.find_elements(By.TAG_NAME, 'p')
            for e in elements:
                tmp = tmp + e.text
            text.append(tmp)
        except:
            publish_date.append("Not available")
            text.append("Not Available")
        time.sleep(3)

    #### Converting the lists to a pandas dataframe via a dictionary ####
    import pandas as pd
    data = {'News Title': news_list, 'News Link': news_link, 'Publish Date': publish_date, 'Description': text}
    df = pd.DataFrame(data)
    df2 = df.copy()

    # Drop the rows whose publish date could not be scraped.
    for p in range(len(df2)):
        if df2['Publish Date'][p] == "Not available":
            df2.drop([p], inplace=True)

    df2["Date + Desc"] = df2["Publish Date"] + df2["Description"]
    df2.reset_index(drop=True, inplace=True)
    return df2
    #df2.to_csv('Prothom_Alo_Description.txt', index=False)
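The fixed time.sleep() pauses above are the simplest approach, but they waste time on fast loads and can still be too short on slow ones. A sketch of an explicit wait with Selenium's WebDriverWait (the XPath is illustrative; driver is the instance created inside get_data):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Block for up to 10 seconds until the headline link is clickable,
# instead of always sleeping for a fixed interval.
wait = WebDriverWait(driver, 10)
link = wait.until(EC.element_to_be_clickable((By.XPATH, '//h3/a')))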
animate.json
ADDED
The diff for this file is too large to render.
app.py
ADDED
@@ -0,0 +1,85 @@
import streamlit as st
import pandas as pd
from PIL import Image
import json
from streamlit_lottie import st_lottie

##### BUET Logo ###########
image = Image.open("buet.png")
new_image = image.resize((100, 100))
#st.image(new_image)
st.title("Automated LLM and Web Scraping based Road Accident Dataset Creation from Newspapers")

######### Animation ##########
def load_lottiefile(filepath: str):
    with open(filepath, "r") as f:
        return json.load(f)

lottie_coding = load_lottiefile("animate.json")
st_lottie(
    lottie_coding,
    height=200,
)

radio_btn1 = st.radio("**Choose the newspaper you want to collect news from**", options=("Prothom Alo", "Dhaka Tribune", "The Daily Star"))
radio_btn2 = st.radio("Choose an LLM model", options=("GPT-3.5 (Medium Cost)", "GPT-4 (High Cost)", "Llama3 (Free)"))

number = st.number_input("**Enter the number of accident news you want the LLM to go through**", min_value=0, max_value=5000)

if st.button("Generate Dataset"):
    st.write("**Please wait until the dataset is finished generating. It takes almost 8 sec to process each entry for GPT-4 and 30 sec for GPT-3.5 and Llama3. So, for example, if you entered 15 as input, it will take almost 2 minutes for GPT-4 and 7.5 min for GPT-3.5 and Llama3. The dataset will appear below.**")

    if radio_btn1 == "Prothom Alo":
        import Prothom_alo_fully_scraped
        df = Prothom_alo_fully_scraped.get_data(number)
    elif radio_btn1 == "Dhaka Tribune":
        import Dhaka_Tribune_Fully_Scraped
        df = Dhaka_Tribune_Fully_Scraped.get_data(number)
    elif radio_btn1 == "The Daily Star":
        import Daily_Star_fully_scraped
        df = Daily_Star_fully_scraped.get_data(number)
    if radio_btn2 == "GPT-4 (High Cost)":
        import LLM_automation_GPT
        df2 = LLM_automation_GPT.create_data(df)
    elif radio_btn2 == "Llama3 (Free)":
        import LLM_automation_Groq
        df2 = LLM_automation_Groq.create_data(df)
    elif radio_btn2 == "GPT-3.5 (Medium Cost)":
        import LLM_automation_GPT35
        df2 = LLM_automation_GPT35.create_data(df)
    st.dataframe(df2)
    print(len(df))

#st.write("""
#**Developed by:**\n
#*MD Thamed Bin Zaman Chowdhury, Student ID: 1904184,*\n
#*Department of Civil Engineering, BUET*\n
#*E-mail: zamanthamed@gmail.com*
#""")

st.write("--------")
st.write("**Modules and packages used to develop the program:**")

######## Other Logos ################
p = 125
image2 = Image.open("pandas.png")
new_image2 = image2.resize((p, p))
image3 = Image.open("numpy.png")
new_image3 = image3.resize((p, p))
image4 = Image.open("selenium_webdriver.jpeg")
new_image4 = image4.resize((p, p))
image5 = Image.open("streamlit.png")
new_image5 = image5.resize((p, p))
image6 = Image.open("openai.png")
new_image6 = image6.resize((p, p))
image7 = Image.open("llama3.jpeg")
new_image7 = image7.resize((p, p))
image8 = Image.open("langchain.png")
new_image8 = image8.resize((p, p))

st.image([new_image2, new_image3, new_image4, new_image5, new_image6, new_image7, new_image8])
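A usage note: the app is started from the command line with

streamlit run app.py

and expects the logo images, animate.json and a .env file providing the OPENAI_API_KEY and GROQ_API_KEY entries in the same directory.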
buet.png
ADDED
langchain.png
ADDED
llama3.jpeg
ADDED
numpy.png
ADDED
openai.png
ADDED
packages.txt
ADDED
@@ -0,0 +1 @@
chromium
pandas.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,14 @@
setuptools==70.0.0
langchain_community==0.2.6
langchain_core==0.2.10
langchain_groq==0.1.5
langchain_openai==0.1.10
newspaper3k==0.2.8
pandas==2.2.2
python-dotenv==1.0.1
selenium==4.22.0
streamlit==1.35.0
lxml==5.2.2
lxml_html_clean==0.1.1
webdriver-manager==4.0.1
streamlit-lottie==0.0.5
selenium.png
ADDED
selenium_webdriver.jpeg
ADDED
streamlit.png
ADDED