SaiedAlshahrani committed: Upload 5 files

Files changed:
- README.md (+5, -5)
- XGBoost_metadata+camelbert_embeddings.model (+3, -0)
- requirements.txt (+13, -0)
- scanner.py (+91, -0)
- scanner_utils.py (+95, -0)
README.md  CHANGED

@@ -1,11 +1,11 @@
 ---
 title: Egyptian Wikipedia Scanner
-emoji:
-colorFrom:
-colorTo:
+emoji: 🇪🇬
+colorFrom: gray
+colorTo: red
 sdk: streamlit
-sdk_version: 1.
-app_file:
+sdk_version: 1.31.1
+app_file: scanner.py
 pinned: false
 license: mit
 ---
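Hugging Face Spaces reads this YAML frontmatter to configure the app: `sdk: streamlit` with `sdk_version: 1.31.1` selects the runtime, and `app_file: scanner.py` is the entry point. A minimal sketch of parsing such a card locally, assuming PyYAML is available (it is not listed in requirements.txt, so this is illustrative only):

```python
# Illustrative only: read the Space's frontmatter the way a helper script might.
# PyYAML is assumed here; it is NOT part of this Space's requirements.txt.
import yaml

def read_frontmatter(path="README.md"):
    with open(path, encoding="utf-8") as f:
        text = f.read()
    # The card sits between the first two '---' delimiters at the top of the file.
    _, block, _ = text.split("---", 2)
    return yaml.safe_load(block)

config = read_frontmatter()
print(config["sdk"], config["sdk_version"], config["app_file"])
# With this commit: streamlit 1.31.1 scanner.py
```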
XGBoost_metadata+camelbert_embeddings.model  ADDED

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c9c5af4de2e308394a520a60a309755964f33554bf53b4b1942c5d3b5aa8e1b7
size 21531
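This file is a Git LFS pointer, not the model weights themselves; the roughly 21 KB binary is resolved from LFS storage at checkout. A small sketch, assuming the real file has already been pulled locally (e.g. with `git lfs pull`), that checks the local copy against the pointer's recorded digest and size:

```python
# Sanity-check the downloaded LFS object against the pointer shown above.
# Assumes the real binary has replaced the pointer file locally.
import hashlib
import os

MODEL_PATH = "XGBoost_metadata+camelbert_embeddings.model"
EXPECTED_OID = "c9c5af4de2e308394a520a60a309755964f33554bf53b4b1942c5d3b5aa8e1b7"
EXPECTED_SIZE = 21531  # bytes, from the pointer's `size` line

digest = hashlib.sha256()
with open(MODEL_PATH, "rb") as f:
    for chunk in iter(lambda: f.read(8192), b""):
        digest.update(chunk)

assert os.path.getsize(MODEL_PATH) == EXPECTED_SIZE, "size mismatch: still a pointer file?"
assert digest.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("Model file matches the LFS pointer.")
```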
requirements.txt  ADDED

@@ -0,0 +1,13 @@
numpy
torch
typing
pandas
xgboost
requests
wikipedia
streamlit
torchvision
scikit-learn
transformers
beautifulsoup4
streamlit-searchbox
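The dependencies are unpinned, so resolved versions can drift over time (and away from the `sdk_version` declared in README.md). A hedged helper, not part of the Space, that reports what is actually installed so the list could be pinned later; note that `typing` ships with the standard library on Python 3, so it may not resolve as an installed distribution:

```python
# Report installed versions of the dependencies listed above (illustrative helper,
# not part of the Space). Useful for pinning requirements.txt to known-good versions.
from importlib import metadata

packages = [
    "numpy", "torch", "typing", "pandas", "xgboost", "requests", "wikipedia",
    "streamlit", "torchvision", "scikit-learn", "transformers",
    "beautifulsoup4", "streamlit-searchbox",
]

for name in packages:
    try:
        print(f"{name}=={metadata.version(name)}")
    except metadata.PackageNotFoundError:
        print(f"{name}: not found as an installed distribution")
```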
scanner.py  ADDED

@@ -0,0 +1,91 @@
import logging
import warnings
import wikipedia
import streamlit as st
from typing import List
from scanner_utils import *
from xgboost import XGBClassifier
from streamlit_searchbox import st_searchbox
from transformers import logging as hflogging


logging.disable(logging.WARNING)
hflogging.set_verbosity_warning()

warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

st.set_page_config(layout="centered", page_title="Egyptian Wikipedia Scanner", page_icon="🇪🇬")

wikipedia.set_lang("arz")


with open('.streamlit/style.css') as f:
    st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)


st.markdown("""
    <h1 style='text-align: center';>Egyptian Arabic Wikipedia Scanner</h1>
    <h5 style='text-align: center';>Automatic Detection of Template-translated Articles in the Egyptian Wikipedia</h5>
""", unsafe_allow_html=True)


st.markdown("", unsafe_allow_html=True)


def search_wikipedia(searchterm: str) -> List[any]:
    return wikipedia.search(searchterm) if searchterm else []


@st.cache_resource
def load_xgb_model(model):
    loaded_xgb_classifier = XGBClassifier()
    loaded_xgb_classifier.load_model(model)
    return loaded_xgb_classifier


selected_title = st_searchbox(search_wikipedia, label="Search for an article in Egyptian Arabic Wikipedia:",
                              placeholder="Search for an article", rerun_on_update=True, clear_on_submit=False, key="wiki_searchbox")

if selected_title:
    X, article, dataframe, selected_title = prepare_features(selected_title)

    st.write(f':black_small_square: Collected Metadata of **{selected_title}**')

    st.dataframe(dataframe, hide_index=True, use_container_width=True)

    loaded_xgb_classifier = load_xgb_model("XGBoost_metadata+camelbert_embeddings.model")

    id2label = {0: 'Human-generated Article', 1: 'Template-translated Article'}

    result = id2label[int(loaded_xgb_classifier.predict(X))]

    if result == 'Human-generated Article':
        st.write(f":black_small_square: Automatic Classification of **{selected_title}**")
        st.success(result, icon="✅")

    else:
        st.write(f":black_small_square: Automatic Classification of **{selected_title}**")
        st.error(result, icon="🚨")

    st.write(f":black_small_square: Full Summary of **{selected_title}**")

    with st.expander(f'**{selected_title}**', expanded=True):
        st.markdown('<style>p {text-align: justify;}</style>', unsafe_allow_html=True)
        try:
            article_text = wikipedia.summary(selected_title)

        except wikipedia.exceptions.DisambiguationError as e:
            article_text = wikipedia.summary(e.options[0])
        st.write(article_text)
        st.write(f'> :globe_with_meridians: Read Full Text of **{selected_title}**: <br>{article.url}', unsafe_allow_html=True)


st.markdown('<br><br>', unsafe_allow_html=True)


footer = """
<div class="footer"> <p class="p1">Copyright © 2024 by *****************<br>Hosted with Hugging Face Spaces 🤗</p> </div>
"""
st.markdown(footer, unsafe_allow_html=True)
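The app's flow is: search box → `prepare_features` (from scanner_utils.py) → XGBoost prediction → rendered verdict and article summary. A headless sketch of the same classification path, reusing the committed model and utilities; the example title and the `predict_proba` confidence read-out are assumptions for illustration, not part of the app:

```python
# Minimal headless sketch of the same pipeline, assuming this repository's files
# are on disk and its dependencies are installed. The title below is only an example.
import wikipedia
from xgboost import XGBClassifier
from scanner_utils import prepare_features

wikipedia.set_lang("arz")  # Egyptian Arabic Wikipedia, as in scanner.py

title = "القاهره"  # hypothetical query; any arz.wikipedia.org title works
X, article, dataframe, title = prepare_features(title)

clf = XGBClassifier()
clf.load_model("XGBoost_metadata+camelbert_embeddings.model")

id2label = {0: "Human-generated Article", 1: "Template-translated Article"}
pred = int(clf.predict(X)[0])
proba = clf.predict_proba(X)[0][pred]  # class probability as a rough confidence

print(f"{title}: {id2label[pred]} (p={proba:.2f})")
print(article.url)
```

In the Space itself, `@st.cache_resource` keeps the loaded classifier and `@st.cache_data` the fetched features across Streamlit reruns, so the search-and-classify loop stays responsive.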
scanner_utils.py  ADDED

@@ -0,0 +1,95 @@
import re
import requests
import wikipedia
import numpy as np
import pandas as pd
import streamlit as st
from bs4 import BeautifulSoup
from transformers import AutoModel
from transformers import BertTokenizer


def clean_page_text(text):
    text = re.sub(r'[^\w\s]', ' ', text)           # Replaces the non-alphanumeric characters with spaces.
    text = re.sub(r'[^\u0600-\u06FF]', ' ', text)  # Replaces the non-Arabic characters with spaces.
    text = re.sub(r'\s+', ' ', text)               # Replaces extra spaces with a single space.
    return text


@st.cache_resource
def encode_page_text(page_text):
    tokenizer = BertTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-mix-pos-egy')
    model = AutoModel.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-mix-pos-egy')

    tokenized_page_text = tokenizer(page_text, return_tensors='pt', max_length=512, truncation=True)
    encoded_page_text = model(**tokenized_page_text)[0][0][0].tolist()

    return encoded_page_text


@st.cache_data
def get_page_info(title):
    page_info = f"https://xtools.wmcloud.org/api/page/articleinfo/arz.wikipedia.org/{title}?format=json"

    creation_date = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['created_at']
    creator_name = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['author']
    total_edits = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['revisions']
    total_editors = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['editors']

    return creation_date, creator_name, total_edits, total_editors


@st.cache_data
def get_page_prose(title):
    page_prose = f"https://xtools.wmcloud.org/api/page/prose/arz.wikipedia.org/{title}?format=json"

    total_bytes = eval(str(BeautifulSoup(requests.get(page_prose).content, "html.parser")).replace('null', 'None'))['bytes']
    total_words = eval(str(BeautifulSoup(requests.get(page_prose).content, "html.parser")).replace('null', 'None'))['words']
    total_chars = eval(str(BeautifulSoup(requests.get(page_prose).content, "html.parser")).replace('null', 'None'))['characters']

    return total_bytes, total_words, total_chars


@st.cache_data
def prepare_features(selected_title):
    dataframe = get_metadata_features(selected_title)

    try:
        article = wikipedia.page(selected_title)
        full_article_text = clean_page_text(article.content)

    except wikipedia.exceptions.DisambiguationError as e:
        selected_title = e.options[0]
        article = wikipedia.page(selected_title)
        full_article_text = clean_page_text(article.content)

    encode_full_article_text = encode_page_text(full_article_text)

    X = []

    for i in range(dataframe.shape[0]):
        x = []
        x.append(dataframe['Total Edits'][i])
        x.append(dataframe['Total Editors'][i])
        x.append(dataframe['Total Bytes'][i])
        x.append(dataframe['Total Characters'][i])
        x.append(dataframe['Total Words'][i])

        # Both page_metadata + page_text_embeddings
        X.append(np.hstack([x, list(encode_full_article_text)]))

    return X, article, dataframe, selected_title


@st.cache_data
def get_metadata_features(selected_title):
    creation_date, creator_name, total_edits, total_editors = get_page_info(selected_title)
    total_bytes, total_words, total_chars = get_page_prose(selected_title)

    data = {'Total Edits': [total_edits], 'Total Editors': [total_editors], 'Total Bytes': [total_bytes],
            'Total Characters': [total_chars], 'Total Words': [total_words], 'Creator Name': [creator_name],
            'Creation Date': [creation_date]}

    dataframe = pd.DataFrame(data)

    return dataframe
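`get_page_info` and `get_page_prose` re-request the same XTools endpoint once per field and parse the response by running `eval` over a BeautifulSoup dump. Since XTools returns plain JSON, the same fields can be read with one request and `requests`' built-in decoder; the sketch below is an alternative, not what this commit does:

```python
# Alternative (not in this commit): fetch each XTools endpoint once and use
# requests' built-in JSON decoding instead of eval() over a BeautifulSoup dump.
import requests

def get_page_info_json(title):
    url = f"https://xtools.wmcloud.org/api/page/articleinfo/arz.wikipedia.org/{title}?format=json"
    data = requests.get(url, timeout=30).json()  # single request, safe parsing
    return data["created_at"], data["author"], data["revisions"], data["editors"]

def get_page_prose_json(title):
    url = f"https://xtools.wmcloud.org/api/page/prose/arz.wikipedia.org/{title}?format=json"
    data = requests.get(url, timeout=30).json()
    return data["bytes"], data["words"], data["characters"]
```

For reference, each row of `X` built by `prepare_features` concatenates the five metadata counts with the CLS-token embedding from the CAMeLBERT model; assuming the usual bert-base hidden size of 768, that is 773 features per article, matching what the committed XGBoost model expects.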