SaiedAlshahrani committed: Upload 5 files

Files changed:
- README.md (+5, -5)
- XGBoost_metadata+camelbert_embeddings.model (+3, -0)
- requirements.txt (+13, -0)
- scanner.py (+91, -0)
- scanner_utils.py (+95, -0)
README.md  CHANGED

@@ -1,11 +1,11 @@
 ---
 title: Egyptian Wikipedia Scanner
-emoji:
-colorFrom:
-colorTo:
+emoji: 🇪🇬
+colorFrom: gray
+colorTo: red
 sdk: streamlit
-sdk_version: 1.
-app_file:
+sdk_version: 1.31.1
+app_file: scanner.py
 pinned: false
 license: mit
 ---
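Hugging Face Spaces reads this YAML frontmatter to configure the app: `sdk: streamlit` with `sdk_version: 1.31.1` selects the runtime, and `app_file: scanner.py` is the entry point. A minimal sketch of parsing such a card locally, assuming PyYAML is available (it is not listed in requirements.txt, so this is illustrative only):

```python
# Illustrative only: read the Space's frontmatter the way a helper script might.
# PyYAML is assumed here; it is NOT part of this Space's requirements.txt.
import yaml

def read_frontmatter(path="README.md"):
    with open(path, encoding="utf-8") as f:
        text = f.read()
    # The card sits between the first two '---' delimiters at the top of the file.
    _, block, _ = text.split("---", 2)
    return yaml.safe_load(block)

config = read_frontmatter()
print(config["sdk"], config["sdk_version"], config["app_file"])
# With this commit: streamlit 1.31.1 scanner.py
```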
XGBoost_metadata+camelbert_embeddings.model  ADDED

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c9c5af4de2e308394a520a60a309755964f33554bf53b4b1942c5d3b5aa8e1b7
size 21531
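This file is a Git LFS pointer, not the model weights themselves; the roughly 21 KB binary is resolved from LFS storage at checkout. A small sketch, assuming the real file has already been pulled locally (e.g. with `git lfs pull`), that checks the local copy against the pointer's recorded digest and size:

```python
# Sanity-check the downloaded LFS object against the pointer shown above.
# Assumes the real binary has replaced the pointer file locally.
import hashlib
import os

MODEL_PATH = "XGBoost_metadata+camelbert_embeddings.model"
EXPECTED_OID = "c9c5af4de2e308394a520a60a309755964f33554bf53b4b1942c5d3b5aa8e1b7"
EXPECTED_SIZE = 21531  # bytes, from the pointer's `size` line

digest = hashlib.sha256()
with open(MODEL_PATH, "rb") as f:
    for chunk in iter(lambda: f.read(8192), b""):
        digest.update(chunk)

assert os.path.getsize(MODEL_PATH) == EXPECTED_SIZE, "size mismatch: still a pointer file?"
assert digest.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("Model file matches the LFS pointer.")
```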
requirements.txt  ADDED

@@ -0,0 +1,13 @@
numpy
torch
typing
pandas
xgboost
requests
wikipedia
streamlit
torchvision
scikit-learn
transformers
beautifulsoup4
streamlit-searchbox
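The dependencies are unpinned, so resolved versions can drift over time (and away from the `sdk_version` declared in README.md). A hedged helper, not part of the Space, that reports what is actually installed so the list could be pinned later; note that `typing` ships with the standard library on Python 3, so it may not resolve as an installed distribution:

```python
# Report installed versions of the dependencies listed above (illustrative helper,
# not part of the Space). Useful for pinning requirements.txt to known-good versions.
from importlib import metadata

packages = [
    "numpy", "torch", "typing", "pandas", "xgboost", "requests", "wikipedia",
    "streamlit", "torchvision", "scikit-learn", "transformers",
    "beautifulsoup4", "streamlit-searchbox",
]

for name in packages:
    try:
        print(f"{name}=={metadata.version(name)}")
    except metadata.PackageNotFoundError:
        print(f"{name}: not found as an installed distribution")
```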
scanner.py  ADDED

@@ -0,0 +1,91 @@
import logging
import warnings
import wikipedia
import streamlit as st
from typing import List
from scanner_utils import *
from xgboost import XGBClassifier
from streamlit_searchbox import st_searchbox
from transformers import logging as hflogging


logging.disable(logging.WARNING)
hflogging.set_verbosity_warning()

warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

st.set_page_config(layout="centered", page_title="Egyptian Wikipedia Scanner", page_icon="🇪🇬")

wikipedia.set_lang("arz")


with open('.streamlit/style.css') as f:
    st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)


st.markdown("""
    <h1 style='text-align: center';>Egyptian Arabic Wikipedia Scanner</h1>
    <h5 style='text-align: center';>Automatic Detection of Template-translated Articles in the Egyptian Wikipedia</h5>
""", unsafe_allow_html=True)


st.markdown("", unsafe_allow_html=True)


def search_wikipedia(searchterm: str) -> List[any]:
    return wikipedia.search(searchterm) if searchterm else []


@st.cache_resource
def load_xgb_model(model):
    loaded_xgb_classifier = XGBClassifier()
    loaded_xgb_classifier.load_model(model)
    return loaded_xgb_classifier


selected_title = st_searchbox(search_wikipedia, label="Search for an article in Egyptian Arabic Wikipedia:",
                              placeholder="Search for an article", rerun_on_update=True, clear_on_submit=False, key="wiki_searchbox")

if selected_title:
    X, article, dataframe, selected_title = prepare_features(selected_title)

    st.write(f':black_small_square: Collected Metadata of **{selected_title}**')

    st.dataframe(dataframe, hide_index=True, use_container_width=True)

    loaded_xgb_classifier = load_xgb_model("XGBoost_metadata+camelbert_embeddings.model")

    id2label = {0: 'Human-generated Article', 1: 'Template-translated Article'}

    result = id2label[int(loaded_xgb_classifier.predict(X))]

    if result == 'Human-generated Article':
        st.write(f":black_small_square: Automatic Classification of **{selected_title}**")
        st.success(result, icon="✅")

    else:
        st.write(f":black_small_square: Automatic Classification of **{selected_title}**")
        st.error(result, icon="🚨")

    st.write(f":black_small_square: Full Summary of **{selected_title}**")

    with st.expander(f'**{selected_title}**', expanded=True):
        st.markdown('<style>p {text-align: justify;}</style>', unsafe_allow_html=True)
        try:
            article_text = wikipedia.summary(selected_title)

        except wikipedia.exceptions.DisambiguationError as e:
            article_text = wikipedia.summary(e.options[0])
        st.write(article_text)
        st.write(f'> :globe_with_meridians: Read Full Text of **{selected_title}**: <br>{article.url}', unsafe_allow_html=True)


st.markdown('<br><br>', unsafe_allow_html=True)


footer = """
<div class="footer"> <p class="p1">Copyright © 2024 by *****************<br>Hosted with Hugging Face Spaces 🤗</p> </div>
"""
st.markdown(footer, unsafe_allow_html=True)
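The app's flow is: search box → `prepare_features` (from scanner_utils.py) → XGBoost prediction → rendered verdict and article summary. A headless sketch of the same classification path, reusing the committed model and utilities; the example title and the `predict_proba` confidence read-out are assumptions for illustration, not part of the app:

```python
# Minimal headless sketch of the same pipeline, assuming this repository's files
# are on disk and its dependencies are installed. The title below is only an example.
import wikipedia
from xgboost import XGBClassifier
from scanner_utils import prepare_features

wikipedia.set_lang("arz")  # Egyptian Arabic Wikipedia, as in scanner.py

title = "القاهره"  # hypothetical query; any arz.wikipedia.org title works
X, article, dataframe, title = prepare_features(title)

clf = XGBClassifier()
clf.load_model("XGBoost_metadata+camelbert_embeddings.model")

id2label = {0: "Human-generated Article", 1: "Template-translated Article"}
pred = int(clf.predict(X)[0])
proba = clf.predict_proba(X)[0][pred]  # class probability as a rough confidence

print(f"{title}: {id2label[pred]} (p={proba:.2f})")
print(article.url)
```

In the Space itself, `@st.cache_resource` keeps the loaded classifier and `@st.cache_data` the fetched features across Streamlit reruns, so the search-and-classify loop stays responsive.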
scanner_utils.py  ADDED

@@ -0,0 +1,95 @@
import re
import requests
import wikipedia
import numpy as np
import pandas as pd
import streamlit as st
from bs4 import BeautifulSoup
from transformers import AutoModel
from transformers import BertTokenizer


def clean_page_text(text):
    text = re.sub(r'[^\w\s]', ' ', text)           # Replaces the non-alphanumeric characters with spaces.
    text = re.sub(r'[^\u0600-\u06FF]', ' ', text)  # Replaces the non-Arabic characters with spaces.
    text = re.sub(r'\s+', ' ', text)               # Replaces extra spaces with a single space.
    return text


@st.cache_resource
def encode_page_text(page_text):
    tokenizer = BertTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-mix-pos-egy')
    model = AutoModel.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-mix-pos-egy')

    tokenized_page_text = tokenizer(page_text, return_tensors='pt', max_length=512, truncation=True)
    encoded_page_text = model(**tokenized_page_text)[0][0][0].tolist()

    return encoded_page_text


@st.cache_data
def get_page_info(title):
    page_info = f"https://xtools.wmcloud.org/api/page/articleinfo/arz.wikipedia.org/{title}?format=json"

    creation_date = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['created_at']
    creator_name = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['author']
    total_edits = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['revisions']
    total_editors = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['editors']

    return creation_date, creator_name, total_edits, total_editors


@st.cache_data
def get_page_prose(title):
    page_prose = f"https://xtools.wmcloud.org/api/page/prose/arz.wikipedia.org/{title}?format=json"

    total_bytes = eval(str(BeautifulSoup(requests.get(page_prose).content, "html.parser")).replace('null', 'None'))['bytes']
    total_words = eval(str(BeautifulSoup(requests.get(page_prose).content, "html.parser")).replace('null', 'None'))['words']
    total_chars = eval(str(BeautifulSoup(requests.get(page_prose).content, "html.parser")).replace('null', 'None'))['characters']

    return total_bytes, total_words, total_chars


@st.cache_data
def prepare_features(selected_title):
    dataframe = get_metadata_features(selected_title)

    try:
        article = wikipedia.page(selected_title)
        full_article_text = clean_page_text(article.content)

    except wikipedia.exceptions.DisambiguationError as e:
        selected_title = e.options[0]
        article = wikipedia.page(selected_title)
        full_article_text = clean_page_text(article.content)

    encode_full_article_text = encode_page_text(full_article_text)

    X = []

    for i in range(dataframe.shape[0]):
        x = []
        x.append(dataframe['Total Edits'][i])
        x.append(dataframe['Total Editors'][i])
        x.append(dataframe['Total Bytes'][i])
        x.append(dataframe['Total Characters'][i])
        x.append(dataframe['Total Words'][i])

        # Both page_metadata + page_text_embeddings
        X.append(np.hstack([x, list(encode_full_article_text)]))

    return X, article, dataframe, selected_title


@st.cache_data
def get_metadata_features(selected_title):
    creation_date, creator_name, total_edits, total_editors = get_page_info(selected_title)
    total_bytes, total_words, total_chars = get_page_prose(selected_title)

    data = {'Total Edits': [total_edits], 'Total Editors': [total_editors], 'Total Bytes': [total_bytes],
            'Total Characters': [total_chars], 'Total Words': [total_words], 'Creator Name': [creator_name],
            'Creation Date': [creation_date]}

    dataframe = pd.DataFrame(data)

    return dataframe
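`get_page_info` and `get_page_prose` re-request the same XTools endpoint once per field and parse the response by running `eval` over a BeautifulSoup dump. Since XTools returns plain JSON, the same fields can be read with one request and `requests`' built-in decoder; the sketch below is an alternative, not what this commit does:

```python
# Alternative (not in this commit): fetch each XTools endpoint once and use
# requests' built-in JSON decoding instead of eval() over a BeautifulSoup dump.
import requests

def get_page_info_json(title):
    url = f"https://xtools.wmcloud.org/api/page/articleinfo/arz.wikipedia.org/{title}?format=json"
    data = requests.get(url, timeout=30).json()  # single request, safe parsing
    return data["created_at"], data["author"], data["revisions"], data["editors"]

def get_page_prose_json(title):
    url = f"https://xtools.wmcloud.org/api/page/prose/arz.wikipedia.org/{title}?format=json"
    data = requests.get(url, timeout=30).json()
    return data["bytes"], data["words"], data["characters"]
```

For reference, each row of `X` built by `prepare_features` concatenates the five metadata counts with the CLS-token embedding from the CAMeLBERT model; assuming the usual bert-base hidden size of 768, that is 773 features per article, matching what the committed XGBoost model expects.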