samariddin committed
Commit defebef · 1 Parent(s): 42dc02f
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
.idea/Uz-NER.iml ADDED
@@ -0,0 +1,10 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$">
+       <excludeFolder url="file://$MODULE_DIR$/venv" />
+     </content>
+     <orderEntry type="jdk" jdkName="Python 3.8 (Uz-NER)" jdkType="Python SDK" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,16 @@
+ <component name="InspectionProjectProfileManager">
+   <profile version="1.0">
+     <option name="myName" value="Project Default" />
+     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+       <option name="ignoredPackages">
+         <value>
+           <list size="3">
+             <item index="0" class="java.lang.String" itemvalue="onnxruntime-gpu" />
+             <item index="1" class="java.lang.String" itemvalue="opencv-python" />
+             <item index="2" class="java.lang.String" itemvalue="imread-from-url" />
+           </list>
+         </value>
+       </option>
+     </inspection_tool>
+   </profile>
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (Uz-NER)" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/Uz-NER.iml" filepath="$PROJECT_DIR$/.idea/Uz-NER.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="" vcs="Git" />
+   </component>
+ </project>
app.py ADDED
@@ -0,0 +1,571 @@
+ import requests
+ import streamlit as st
+ import wikipedia
+ from wikipedia import WikipediaPage
+ import pandas as pd
+ import spacy
+ import unicodedata
+ from nltk.corpus import stopwords
+ import numpy as np
+ import nltk
+ from newspaper import Article
+
+ nltk.download('stopwords')
+ from string import punctuation
+ import json
+ import time
+ from datetime import datetime, timedelta
+ import urllib
+ from io import BytesIO
+ from PIL import Image, UnidentifiedImageError
+ from SPARQLWrapper import SPARQLWrapper, JSON, N3
+ from fuzzywuzzy import process, fuzz
+ from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode
+ from transformers import pipeline
+ import en_core_web_lg
+
+ # NOTE: search() and get_random_user_agent() are called near the end of this file but are never
+ # imported; they appear to come from the googlesearch module (the "google" package). As committed,
+ # the try/except around that call silently swallows the resulting NameError.
+
+ # DBpedia SPARQL endpoint, queried later for an entity's English label, abstract and thumbnail
+ sparql = SPARQLWrapper('https://dbpedia.org/sparql')
+
+
+ class ExtractArticleEntities:
+     """ Extract article entities from a document using natural language processing (NLP) and fuzzy matching.
+     Parameters
+     - text: a string containing the text of a news article to be parsed
+     Usage:
+     instantiate with a text parameter, i.e. entities = ExtractArticleEntities(text)
+     retrieve Who, What, When, Where entities with entities.www_json
+     retrieve the non-grouped entities with entities.json
+     """
+
+     def __init__(self, text):
+         self.text = text  # preprocess text at initialisation
+         self.text = self.preprocessing(self.text)
+         print(self.text)
+         print('_____text_____')
+         self.json = {}
+         # Create empty dataframe to hold entity data for ease of processing
+         self.entity_df = pd.DataFrame(columns=["entity", "description"])
+         # Load the NER model. The parsing code below iterates doc.ents / ent.label_ / ent.text,
+         # i.e. it expects a spaCy Doc, so the spaCy model is used here; the commented-out
+         # transformers pipeline returns a list of dicts and would not work unchanged.
+         self.nlp = en_core_web_lg.load()
+         # self.nlp = pipeline(model="51la5/roberta-large-NER")
+
+         # Parse the text
+         self.entity_df = self.get_who_what_where_when()
+         # Disambiguate entities
+
+         self.entity_df = self.fuzzy_disambiguation()
+         self.get_related_entity()
+         self.get_popularity()
+         # Create JSON representation of entities
+         self.entity_df = self.entity_df.drop_duplicates(subset=["description"])
+
+         self.entity_df = self.entity_df.reset_index(drop=True)
+
+         # ungrouped entities returned as json
+         self.json = self.entity_json()
+         # json with entities grouped into who, what, where, when keys
+         self.www_json = self.get_wwww_json()
+
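+     # Example usage (sketch; the sample sentence is purely illustrative):
+     #   entities = ExtractArticleEntities("Apple unveiled the iPhone in San Francisco in January 2007.")
+     #   entities.www_json  # JSON string with entities grouped under who / what / where / when
+     #   entities.json      # flat list of entities with their fuzzy matches and page-view counts
+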
+     # def get_related_entity(self):
+     #     entities = self.entity_df.description
+     #     labels = self.entity_df.entity
+     #     related_entity = []
+     #     for entity, label in zip(entities, labels):
+     #         if label in ('PERSON', 'ORG', 'GPE', 'NORP', 'LOC'):
+     #             related_entity.append(wikipedia.search(entity, 3))
+     #         else:
+     #             related_entity.append([None])
+
+     #     self.entity_df['Wikipedia Entity'] = related_entity
+
+     def get_popularity(self):
+         # names = self.entity_df.description
+         # related_names = self.entity_df['Matched Entity']
+         # for name, related_name in zip(names, related_names):
+         #     if related_name:
+         #         related_name.append(name)
+         #         pytrends.build_payload(related_name, timeframe='now 4-d')
+         #         st.dataframe(pytrends.interest_over_time())
+         #         time.sleep(2)
+         master_df = pd.DataFrame()
+         view_list = []
+         for entity in self.entity_df['Matched Entity']:
+             if entity:
+                 entity_to_look = entity[0]
+                 # print(entity_to_look, '_______')
+                 entity_to_look = entity_to_look.replace(' ', '_')
+                 print(entity_to_look, '_______')
+                 headers = {
+                     'accept': 'application/json',
+                     'User-Agent': 'Foo bar'
+                 }
+
+                 now = datetime.now()
+                 now_dt = now.strftime(r'%Y%m%d')
+                 week_back = now - timedelta(days=7)
+                 week_back_dt = week_back.strftime(r'%Y%m%d')
+                 resp = requests.get(
+                     f'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/{entity_to_look}/daily/{week_back_dt}/{now_dt}',
+                     headers=headers)
+                 data = resp.json()
+                 # print(data)
+                 # assumes the response contains an 'items' list; pages with no view data will raise here
+                 df = pd.json_normalize(data['items'])
+                 view_count = sum(df['views'])
+
+             else:
+                 view_count = 0
+             view_list.append(view_count)
+
+         self.entity_df['Views'] = view_list
+
+         for entity in ('PERSON', 'ORG', 'GPE', 'NORP', 'LOC'):
+             related_entity_view_list = []
+             # .copy() avoids pandas' SettingWithCopyWarning when adding columns to this slice
+             grouped_df = self.entity_df[self.entity_df['entity'] == entity].copy()
+             grouped_df['Matched count'] = grouped_df['fuzzy_match'].apply(len)
+             grouped_df['Wiki count'] = grouped_df['Matched Entity'].apply(len)
+
+             grouped_df = grouped_df.sort_values(by=['Views', 'Matched count', 'Wiki count'],
+                                                 ascending=False).reset_index(drop=True)
+             if not grouped_df.empty:
+                 # st.dataframe(grouped_df)
+                 master_df = pd.concat([master_df, grouped_df])
+
+         self.sorted_entity_df = master_df
+         if 'Views' in self.sorted_entity_df:
+             self.sorted_entity_df = self.sorted_entity_df.sort_values(by=['Views'], ascending=False).reset_index(
+                 drop=True)
+         # st.dataframe(self.sorted_entity_df)
+         # names = grouped_df['description'][:5].values
+         # print(names, type(names))
+         # if names.any():
+         #     # pytrends.build_payload(names, timeframe='now 1-m')
+         #     st.dataframe(pytrends.get_historical_interest(names,
+         #                                                   year_start=2022, month_start=10, day_start=1,
+         #                                                   hour_start=0,
+         #                                                   year_end=2022, month_end=10, day_end=21,
+         #                                                   hour_end=0, cat=0, geo='', gprop='', sleep=0))
+         #     st.dataframe()
+         #     time.sleep(2)
+         #     st.dataframe(grouped_df)
+
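+     # The pageviews request above assumes a response roughly of the form
+     #   {"items": [{"article": "...", "timestamp": "2022100100", "views": 1234}, ...]}
+     # and sums the daily view counts over the trailing week.
+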
+     def get_related_entity(self):
+         names = self.entity_df.description
+         entities = self.entity_df.entity
+         self.related_entity = []
+         match_scores = []
+         for name, entity in zip(names, entities):
+             if entity in ('PERSON', 'ORG', 'GPE', 'NORP', 'LOC'):
+                 related_names = wikipedia.search(name, 10)
+                 self.related_entity.append(related_names)
+                 matches = process.extract(name, related_names)
+                 match_scores.append([match[0] for match in matches if match[1] >= 90])
+             else:
+                 self.related_entity.append([None])
+                 match_scores.append([])
+         # Remove nulls
+
+         self.entity_df['Wikipedia Entity'] = self.related_entity
+         self.entity_df['Matched Entity'] = match_scores
+
+     def fuzzy_disambiguation(self):
+         # Column that will hold the fuzzy matches for each entity
+         self.entity_df['fuzzy_match'] = ''
+         # Candidate choices for each entity type
+         person_choices = self.entity_df.loc[self.entity_df['entity'] == 'PERSON']
+         org_choices = self.entity_df.loc[self.entity_df['entity'] == 'ORG']
+         where_choices = self.entity_df.loc[self.entity_df['entity'] == 'GPE']
+         norp_choices = self.entity_df.loc[self.entity_df['entity'] == 'NORP']
+         loc_choices = self.entity_df.loc[self.entity_df['entity'] == 'LOC']
+         date_choices = self.entity_df.loc[self.entity_df['entity'] == 'DATE']
+
+         def fuzzy_match(row, choices):
+             '''Disambiguate an entity by keeping at most three near-matches (score above 80, excluding
+             exact matches) among entities of the same type. Returns an empty list if there is no match.'''
+             match = process.extract(row["description"], choices["description"], limit=3)
+
+             match = [m[0] for m in match if m[1] > 80 and m[1] != 100]
+
+             if match:
+                 self.fuzzy_match_dict[row["description"]] = match
+
+             return match
+
+         # Apply the fuzzy matching function to the entity dataframe
+
+         self.fuzzy_match_dict = {}
+
+         for i, row in self.entity_df.iterrows():
+             if row['entity'] == 'PERSON':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, person_choices)
+             elif row['entity'] == 'ORG':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, org_choices)
+             elif row['entity'] == 'GPE':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, where_choices)
+             elif row['entity'] == 'NORP':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, norp_choices)
+             elif row['entity'] == 'LOC':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, loc_choices)
+             elif row['entity'] == 'DATE':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, date_choices)
+
+         return self.entity_df
+
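+     # fuzzywuzzy's process.extract() scores each candidate against the query, e.g. (scores illustrative):
+     #   process.extract("Joe Biden", ["Joe Biden", "Biden", "J. Biden"], limit=3)
+     #   -> [("Joe Biden", 100), ("Biden", 90), ("J. Biden", 86)]
+     # fuzzy_match() above keeps only candidates scoring above 80 that are not exact (100) matches.
+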
+     def preprocessing(self, text):
+         """Strip punctuation from the text, replace common mis-encoded characters with their ASCII
+         equivalents, and normalise the result with the "NFKD" Unicode normalisation form."""
+
+         # remove punctuation
+         text = text.translate(str.maketrans("", "", punctuation))
+         # stop word removal is intentionally disabled: dropping stop words can lose context
+         stop_words = stopwords.words('english')
+
+         filtered_words = [word for word in text.split()]  # if word not in stop_words]
+
+         # This is very hacky. Need a better way of handling bad encoding
+         pre_text = " ".join(filtered_words)
+         pre_text = pre_text.replace(' ', ' ')
+         pre_text = pre_text.replace('’', "'")
+         pre_text = pre_text.replace('“', '"')
+         pre_text = pre_text.replace('â€', '"')
+         pre_text = pre_text.replace('‘', "'")
+         pre_text = pre_text.replace('…', '...')
+         pre_text = pre_text.replace('–', '-')
+         pre_text = pre_text.replace("\x9d", '-')
+         # normalize the text
+         pre_text = unicodedata.normalize("NFKD", pre_text)
+         # strip punctuation again as some remains after the replacements above
+         pre_text = pre_text.translate(str.maketrans("", "", punctuation))
+
+         return pre_text
+
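+     # Rough example of the normalisation above:
+     #   preprocessing("Apple’s new iPhone…")  ->  "Apples new iPhone"
+     # (the smart apostrophe is mapped to ASCII and then stripped with the rest of the punctuation)
+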
+     def get_who_what_where_when(self):
+         """Extract entity information from the document.
+         Returns a DataFrame with the following columns:
+         - entity: the entity type (label)
+         - description: the entity text
+         Only these entity types are kept:
+         PERSON, ORG, GPE, LOC, PRODUCT, EVENT, LAW, LANGUAGE, NORP, DATE, TIME"""
+
+         # list to hold entity data
+         article_entity_list = []
+         # tokenize the text
+         doc = self.nlp(self.text)
+         # iterate over the entities in the document but only keep those which are meaningful
+         desired_entities = ['PERSON', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'LAW', 'LANGUAGE', 'NORP', 'DATE',
+                             'TIME']
+         self.label_dict = {}
+
+         # stop_words = stopwords.words('english')
+         for ent in doc.ents:
+             self.label_dict[ent] = ent.label_
+             if ent.label_ in desired_entities:
+                 # add the entity to the list
+                 entity_dict = {ent.label_: ent.text}
+                 article_entity_list.append(entity_dict)
+
+         # dedupe the entities, but only on an exact match of values, as occasionally an ORG entity is also
+         # assigned to PERSON
+         deduplicated_entities = {frozenset(item.values()):
+                                      item for item in article_entity_list}.values()
+         # create a dataframe from the entities
+         for record in deduplicated_entities:
+             record_df = pd.DataFrame(record.items(), columns=["entity", "description"])
+             self.entity_df = pd.concat([self.entity_df, record_df], ignore_index=True)
+
+         print(self.entity_df)
+         print('______________________')
+         return self.entity_df
+
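+     # entity_df at this point looks roughly like (values illustrative):
+     #        entity  description
+     #   0    PERSON  Joe Biden
+     #   1    GPE     Washington
+     #   2    DATE    Tuesday
+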
+     def entity_json(self):
+         """Return a JSON representation of the entities in `entity_df`. Each record has the fields:
+         - entity: the type of the entity in the text
+         - description: the name of the entity as it appears in the input text
+         - fuzzy_match: a list of fuzzy matches for the entity, useful for disambiguating similar entities
+         """
+
+         self.json = json.loads(self.entity_df.to_json(orient='records'))
+         # self.json = json.dumps(self.json, indent=2)
+         return self.json
+
+     def get_wwww_json(self):
+         """Return a JSON string with the same records as `entity_json`, grouped under who, what,
+         where and when keys."""
+
+         # create a json object from the entity dataframe
+         who_dict = {"who": [ent for ent in self.entity_json() if ent['entity'] in ['ORG', 'PERSON']]}
+         where_dict = {"where": [ent for ent in self.entity_json() if ent['entity'] in ['GPE', 'LOC']]}
+         when_dict = {"when": [ent for ent in self.entity_json() if ent['entity'] in ['DATE', 'TIME']]}
+         what_dict = {
+             "what": [ent for ent in self.entity_json() if ent['entity'] in ['PRODUCT', 'EVENT', 'LAW', 'LANGUAGE',
+                                                                             'NORP']]}
+         article_wwww = [who_dict, where_dict, when_dict, what_dict]
+         self.wwww_json = json.dumps(article_wwww, indent=2)
+
+         return self.wwww_json
+
+
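+ # parsed.www_json (stored in session state below) is a JSON string shaped roughly like:
+ #   [{"who":   [{"entity": "PERSON", "description": "Joe Biden", "fuzzy_match": [...], ...}]},
+ #    {"where": [{"entity": "GPE", "description": "Washington", ...}]},
+ #    {"when":  [...]},
+ #    {"what":  [...]}]
+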
+ news_article = st.text_input('Paste an Article here to be parsed')
+ if 'parsed' not in st.session_state:
+     st.session_state['parsed'] = None
+     st.session_state['article'] = None
+ if news_article:
+     st.write('Your news article is')
+     st.write(news_article)
+
+     if st.button('Get details'):
+
+         parsed = ExtractArticleEntities(news_article)
+         if parsed:
+             st.session_state['article'] = parsed.sorted_entity_df
+             st.session_state['parsed'] = True
+             st.session_state['json'] = parsed.www_json
+
+
+ # if not st.session_state['article'].empty:
+
+ def preprocessing(text):
+     """Strip punctuation from the text, replace common mis-encoded characters with their ASCII
+     equivalents, and normalise the result with the "NFKD" Unicode normalisation form."""
+
+     # remove punctuation
+     if text:
+         text = text.translate(str.maketrans("", "", punctuation))
+         # stop word removal is intentionally disabled: dropping stop words can lose context
+         stop_words = stopwords.words('english')
+
+         filtered_words = [word for word in text.split()]  # if word not in stop_words]
+
+         # This is very hacky. Need a better way of handling bad encoding
+         pre_text = " ".join(filtered_words)
+         pre_text = pre_text.replace(' ', ' ')
+         pre_text = pre_text.replace('’', "'")
+         pre_text = pre_text.replace('“', '"')
+         pre_text = pre_text.replace('â€', '"')
+         pre_text = pre_text.replace('‘', "'")
+         pre_text = pre_text.replace('…', '...')
+         pre_text = pre_text.replace('–', '-')
+         pre_text = pre_text.replace("\x9d", '-')
+         # normalize the text
+         pre_text = unicodedata.normalize("NFKD", pre_text)
+         # strip punctuation again as some remains after the replacements above
+         pre_text = pre_text.translate(str.maketrans("", "", punctuation))
+
+     else:
+         pre_text = None
+     return pre_text
+
+
+ def filter_wiki_df(df):
+     key_list = df.keys()[:2]
+     # df.to_csv('test.csv')
+     df = df[key_list]
+     # if len(df.keys()) == 2:
+     df['Match Check'] = np.where(df[df.keys()[0]] != df[df.keys()[1]], True, False)
+
+     df = df[df['Match Check'] != False]
+     df = df[key_list]
+     df = df.dropna(how='any').reset_index(drop=True)
+     # filtered_term = []
+     # for terms in df[df.keys()[0]]:
+     #     if isinstance(terms, str):
+     #         filtered_term.append(preprocessing(terms))
+     #     else:
+     #         filtered_term.append(None)
+     # df[df.keys()[0]] = filtered_term
+     df.rename(columns={key_list[0]: 'Attribute', key_list[1]: 'Value'}, inplace=True)
+
+     return df
+
+
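+ # filter_wiki_df() above is fed the first table that pd.read_html() returns for a Wikipedia article,
+ # which is usually the infobox; it keeps only the rows whose two columns differ (dropping header-style
+ # rows that repeat the same text) and renames the columns to Attribute / Value.
+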
+ def get_entity_from_selectbox(related_entity):
+     entity = st.selectbox('Please select the term:', related_entity, key='foo')
+     if entity:
+         summary_entity = wikipedia.summary(entity, 3)
+         return summary_entity
+
+
+ if st.session_state['parsed']:
+     df = st.session_state['article']
+     # left, right = st.columns(2)
+     # with left:
+     df_to_st = pd.DataFrame()
+
+     df_to_st['Name'] = df['description']
+     df_to_st['Is a type of'] = df['entity']
+     df_to_st['Related to'] = df['Matched Entity']
+     df_to_st['Is a type of'] = df_to_st['Is a type of'].replace({'PERSON': 'Person',
+                                                                  'ORG': 'Organization',
+                                                                  'GPE': 'Political Location',
+                                                                  'NORP': 'Political or Religious Groups',
+                                                                  'LOC': 'Non Political Location'})
+     gb = GridOptionsBuilder.from_dataframe(df_to_st)
+     gb.configure_pagination(paginationAutoPageSize=True)  # Add pagination
+     gb.configure_side_bar()  # Add a sidebar
+     gb.configure_selection('multiple', use_checkbox=True,
+                            groupSelectsChildren="Group checkbox select children")  # Enable multi-row selection
+     gridOptions = gb.build()
+
+     # st.dataframe(df_to_st)
+     grid_response = AgGrid(
+         df_to_st,
+         gridOptions=gridOptions,
+         data_return_mode='AS_INPUT',
+         update_mode='MODEL_CHANGED',
+         fit_columns_on_grid_load=False,
+         enable_enterprise_modules=True,
+         height=350,
+         width='100%',
+         reload_data=True
+     )
+
+     data = grid_response['data']
+     selected = grid_response['selected_rows']
+     selected_df = pd.DataFrame(selected)
+     if not selected_df.empty:
+         selected_entity = selected_df[['Name', 'Is a type of', 'Related to']]
+         st.dataframe(selected_entity)
+
+     # with right:
+     #     st.json(st.session_state['json'])
+
+     entities_list = df['description']
+     # selected_entity = st.selectbox('Which entity you want to choose?',
+     #                                entities_list)
+     if not selected_df.empty and selected_entity['Name'].any():
+
+         # lookup_url = rf'https://lookup.dbpedia.org/api/search?query={selected_entity}'
+         # r = requests.get(lookup_url)
+
+         selected_row = df.loc[df['description'] == selected_entity['Name'][0]]
+
+         entity_value = selected_row.values
+         # st.write('Entity is a ', entity_value[0][0])
+         label, name, fuzzy, related, related_match, _, _, _ = entity_value[0]
+         not_matched = [word for word in related if word not in related_match]
+         fuzzy = fuzzy[0] if len(fuzzy) > 0 else ''
+         related = related[0] if len(related) > 0 else ''
+         not_matched = not_matched[0] if len(not_matched) > 0 else related
+
+         related_entity_list = [name, fuzzy, not_matched]
+         related_entity = entity_value[0][1:]
+
+         google_query_term = ' '.join(related_entity_list)
+         # search()
+         try:
+             # search() / get_random_user_agent() are never imported in this file (see the note near the
+             # imports), so as committed this raises NameError and urls stays empty.
+             urls = [i for i in search(google_query_term, stop=10, pause=2.0, tld='com', lang='en', tbs='0',
+                                       user_agent=get_random_user_agent())]
+         except Exception:
+             urls = []
+         # urls = search(google_query_term+' news latest', num_results=10)
+         st.session_state['wiki_summary'] = False
+         all_related_entity = []
+         for el in related_entity[:-2]:
+             if isinstance(el, str):
+                 all_related_entity.append(el)
+             elif isinstance(el, int):
+                 all_related_entity.append(str(el))
+             else:
+                 all_related_entity.extend(el)
+         # [ if type(el) == 'int' all_related_entity.extend(el) else all_related_entity.extend([el]) for el in related_entity]
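+         # For each candidate name, query DBpedia for an English label, abstract and thumbnail; the first
+         # candidate that returns results is displayed and the loop stops. If nothing is found, fall back
+         # to a selectbox plus wikipedia.summary() below.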
+         summary_entity = None  # make sure this exists even if no candidate below yields a result
+         for entity in all_related_entity:
+             # try:
+             if True:  # placeholder left in place of a removed try/except
+                 if entity:
+                     entity = entity.replace(' ', '_')
+                     query = f'''
+                     SELECT ?name ?comment ?image
+                     WHERE {{ dbr:{entity} rdfs:label ?name.
+                              dbr:{entity} rdfs:comment ?comment.
+                              dbr:{entity} dbo:thumbnail ?image.
+
+                              FILTER (lang(?name) = 'en')
+                              FILTER (lang(?comment) = 'en')
+                     }}'''
+                     sparql.setQuery(query)
+
+                     sparql.setReturnFormat(JSON)
+                     qres = sparql.query().convert()
+                     if qres['results']['bindings']:
+                         result = qres['results']['bindings'][0]
+                         name, comment, image_url = (result['name']['value'], result['comment']['value'],
+                                                     result['image']['value'])
+                         # urllib.request.urlretrieve(image_url, "img.jpg")
+
+                         # img = Image.open("/Users/anujkarn/NER/img.jpg")
+                         wiki_url = f'https://en.wikipedia.org/wiki/{entity}'
+
+                         st.write(name)
+                         # st.image(img)
+                         st.write(image_url)
+                         # try:
+                         response = requests.get(image_url)
+                         try:
+                             related_image = Image.open(BytesIO(response.content))
+                             st.image(related_image)
+                         except UnidentifiedImageError:
+                             st.write('Not able to get image')
+
+                         # except error as e:
+                         #     st.write(f'Image not parsed because of : {e}')
+                         summary_entity = comment
+                         wiki_knowledge_df = pd.read_html(wiki_url)[0]
+                         wiki_knowledge_df = filter_wiki_df(wiki_knowledge_df)
+
+                         st.write('Showing description for entity:', name)
+                         st.dataframe(wiki_knowledge_df)
+                         # if st.button('Want something else?'):
+                         #     summary_entity = get_entity_from_selectbox(all_related_entity)
+                         break
+                     # summary_entity = wikipedia.summary(entity, 3)
+                 else:
+                     summary_entity = None
+         if not summary_entity:
+             try:
+                 summary_entity = get_entity_from_selectbox(all_related_entity)
+                 # page = WikipediaPage(entity)
+
+             except wikipedia.exceptions.DisambiguationError:
+                 st.write('This term has a Wikipedia disambiguation page, so no single summary could be chosen')
+
+         if selected_entity['Name'].any():
+             st.write(f'Summary for {selected_entity["Name"][0]}')
+             st.write(summary_entity)
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ https://huggingface.co/spacy/en_core_web_lg/resolve/main/en_core_web_lg-any-py3-none-any.whl
+ fastapi==0.88.0
+ fuzzywuzzy==0.18.0
+ matplotlib==3.3.4
+ newspaper3k==0.2.8
+ nltk==3.6.1
+ numpy==1.19.5
+ pandas==1.2.4
+ Pillow==9.3.0
+ requests==2.25.1
+ spacy
+ SPARQLWrapper==2.0.0
+ streamlit==1.11.1
+ wikipedia==1.4.0
+ streamlit-aggrid
+ transformers==2.5.0
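
To try the Space locally (a sketch, assuming the pinned versions above install cleanly on your platform; the NLTK stopwords corpus is downloaded by app.py on first run):

    pip install -r requirements.txt
    streamlit run app.py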