Spaces:

ksvmuralidhar
/

news_classifier_api

Sleeping

App Files Files Community

ksvmuralidhar committed on Aug 22, 2024

Commit

83d8595

verified ·

1 Parent(s): bdb5934

Upload 10 files

Browse files

Files changed (10) hide show

Dockerfile +58 -0
api.py +185 -0
calibrated_classifier.py +85 -0
classification_models/calibrated_model.bin +3 -0
classification_models/label_encoder.bin +3 -0
classification_models/model.tflite +3 -0
config.py +2 -0
news_classifier.py +56 -0
requirements.txt +12 -0
scraper.py +71 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,58 @@

# Container image for the news classifier API: a FastAPI app served by uvicorn
# on port 7860, with Google Chrome and chromedriver available for scraper.py.
FROM python:3.10-slim
WORKDIR /code
# Python dependencies are installed before the sources are copied so this layer
# is reused when only application code changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
# System libraries and Google Chrome are installed in ONE layer. The package
# index fetched by `apt-get update` must still exist when the Chrome .deb is
# installed, so apt can resolve any dependency missing from the list below;
# the index is deleted only at the very end of the layer.
# NOTE(review): Chrome is the unpinned "stable_current" build, while the
# chromedriver below is pinned to 127.0.6533.119 and scraper.py passes
# version_main=127 -- confirm the two stay compatible, or pin Chrome as well.
RUN apt-get update && \
    apt-get install -y \
    ffmpeg \
    wget \
    unzip \
    gnupg \
    ca-certificates \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libfontconfig1 \
    libnss3 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libatspi2.0-0 \
    libcups2 \
    libcurl4 \
    libgtk-3-0 \
    libnspr4 \
    libxcomposite1 \
    libxdamage1 \
    xdg-utils \
    fonts-liberation \
    libu2f-udev \
    && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
    && apt-get install -y ./google-chrome-stable_current_amd64.deb \
    && rm google-chrome-stable_current_amd64.deb \
    && rm -rf /var/lib/apt/lists/*
# Everything below runs as an unprivileged user with UID 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    CHROMEDRIVERURL=https://storage.googleapis.com/chrome-for-testing-public/127.0.6533.119/linux64/chromedriver-linux64.zip \
    CHROMEDRIVERFILENAME=chromedriver-linux64.zip
WORKDIR $HOME/app
COPY --chown=user . $HOME/app
# Download chromedriver into the app directory, unpack it there (this is the
# ./chromedriver-linux64/chromedriver path that config.py points at), drop the
# archive and make the binary executable.
RUN wget -P $HOME/app $CHROMEDRIVERURL && \
    unzip $HOME/app/$CHROMEDRIVERFILENAME && \
    rm $HOME/app/$CHROMEDRIVERFILENAME && \
    chmod +x $HOME/app/chromedriver-linux64/chromedriver
# Lists the app directory in the build log.
RUN ls -ltr
EXPOSE 7860
ENTRYPOINT ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "3"]

api.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import cloudpickle
+import os
+import tensorflow as tf
+from scraper import scrape_text
+from fastapi import FastAPI, Response, Request
+from typing import List, Dict
+from pydantic import BaseModel, Field
+from fastapi.exceptions import RequestValidationError
+import uvicorn
+import json
+import logging
+import multiprocessing
+from news_classifier import predict_news_classes
# Process-wide environment switches, set at import time.
# NOTE(review): presumably these silence the Hugging Face tokenizers fork
# warning and select the legacy Keras implementation; both are set after
# `import tensorflow` above -- confirm they still take effect at this point.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TF_USE_LEGACY_KERAS"] = "1"
def load_model():
    """Load the pickled classifier artefacts from ``classification_models/``.

    Unpickles the label encoder and the calibrated model with cloudpickle,
    then points the calibrated model's wrapped estimator at the TFLite file
    in the same directory.

    Returns:
        A ``(calibrated_model, label_encoder)`` tuple.
    """
    logging.warning('Entering load transformer')
    with open("classification_models/label_encoder.bin", "rb") as encoder_file:
        label_encoder = cloudpickle.load(encoder_file)
    with open("classification_models/calibrated_model.bin", "rb") as classifier_file:
        calibrated_model = cloudpickle.load(classifier_file)
    # The pickled estimator carries its own model path; overwrite it with the
    # location of the TFLite file shipped alongside the pickles.
    calibrated_model.estimator.tflite_model_path = os.path.join(
        "classification_models", "model.tflite"
    )
    logging.warning('Exiting load transformer')
    return calibrated_model, label_encoder
async def scrape_urls(urls):
    """Scrape every URL in a pool of worker processes.

    Each URL is submitted to ``scrape_text`` via ``apply_async`` and the
    results are collected in input order, waiting up to 120 seconds for each.

    Args:
        urls: Iterable of article URLs.

    Returns:
        ``(scraped_texts, scrape_errors)``: two lists aligned with ``urls``.

    Raises:
        multiprocessing.TimeoutError: if a result is not ready within 120 s.

    NOTE(review): the result collection blocks, so the event loop is held for
    the whole scrape even though this is declared ``async``.
    """
    logging.warning('Entering scrape_urls()')
    scraped_texts = []
    scrape_errors = []
    # The context manager terminates the pool on exit, so worker processes are
    # released even when a .get() times out or raises; previously the pool was
    # only closed on the success path.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        pending = [pool.apply_async(scrape_text, [url]) for url in urls]
        for job in pending:
            text, error = job.get(timeout=120)
            scraped_texts.append(text)
            scrape_errors.append(error)
    logging.warning('Exiting scrape_urls()')
    return scraped_texts, scrape_errors
# Text shown as the API description in the generated OpenAPI / Swagger docs.
description = '''API to classify news articles into categories from their URLs.\n
Categories = ASTROLOGY, BUSINESS, EDUCATION, ENTERTAINMENT, HEALTH, NATION, SCIENCE, SPORTS, TECHNOLOGY, WEATHER, WORLD'''
# The FastAPI application object; the Dockerfile ENTRYPOINT serves it as "api:app".
app = FastAPI(title='News Classifier API',
              description=description,
              version="0.0.1",
              contact={
                  "name": "Author: KSV Muralidhar",
                  "url": "https://ksvmuralidhar.in"
              },
             license_info={
                 "name": "License: MIT",
                 "identifier": "MIT"
             },
             # NOTE(review): presumably hides the "Schemas" section in Swagger UI -- confirm.
             swagger_ui_parameters={"defaultModelsExpandDepth": -1})
class URLList(BaseModel):
    """Request payload accepted by ``POST /classify/``."""
    urls: List[str] = Field(..., description="List of URLs of news articles to classify")
    key: str = Field(..., description="Authentication Key")
class Categories(BaseModel):
    """Predicted category for one article."""
    label: str = Field(..., description="category label")
    calibrated_prediction_proba: float = Field(...,
                                               description="calibrated prediction probability (confidence)")
# The four response schemas below are used for the OpenAPI documentation. The
# handlers build their JSON payloads by hand and emit the key "categories"
# (a list with one entry per URL on success, an empty string on errors), so the
# schemas use that name and shape instead of a single "category" object.
class SuccessfulResponse(BaseModel):
    """Documented shape of a 200 response."""
    urls: List[str] = Field(..., description="List of URLs of news articles inputted by the user")
    scraped_texts: List[str] = Field(..., description="List of scraped text from input URLs")
    scrape_errors: List[str] = Field(..., description="List of errors raised during scraping. One item for corresponding URL")
    categories: List[Categories] = Field(..., description="List of dicts (one per URL) of category label of news articles along with calibrated prediction_proba")
    classifier_error: str = Field("", description="Empty string as the response code is 200")
class AuthenticationError(BaseModel):
    """Documented shape of a 401 response."""
    urls: List[str] = Field(..., description="List of URLs of news articles inputted by the user")
    scraped_texts: str = Field("", description="Empty string as authentication failed")
    scrape_errors: str = Field("", description="Empty string as authentication failed")
    categories: str = Field("", description="Empty string as authentication failed")
    classifier_error: str = Field("Error: Authentication error: Invalid API key.")
class ClassifierError(BaseModel):
    """Documented shape of a 500 response."""
    urls: List[str] = Field(..., description="List of URLs of news articles inputted by the user")
    scraped_texts: List[str] = Field(..., description="List of scraped text from input URLs")
    scrape_errors: List[str] = Field(..., description="List of errors raised during scraping. One item for corresponding URL")
    categories: str = Field("", description="Empty string as classifier encountered an error")
    classifier_error: str = Field("Error: Classifier Error with a message describing the error")
class InputValidationError(BaseModel):
    """Documented shape of a 422 response."""
    urls: List[str] = Field(..., description="List of URLs of news articles inputted by the user")
    scraped_texts: str = Field("", description="Empty string as validation failed")
    scrape_errors: str = Field("", description="Empty string as validation failed")
    categories: str = Field("", description="Empty string as validation failed")
    classifier_error: str = Field("Validation Error with a message describing the error")
class NewsClassifierAPIAuthenticationError(Exception):
    """Raised when the API key supplied with a request is rejected."""
class NewsClassifierAPIScrapingError(Exception):
    """Raised when no text could be scraped from any of the requested URLs."""
def authenticate_key(api_key: str):
    """Check the caller's key against the ``API_KEY`` environment variable.

    Args:
        api_key: Key supplied in the request payload.

    Returns:
        None when the key matches.

    Raises:
        NewsClassifierAPIAuthenticationError: if the key does not match, is
            not a string, or ``API_KEY`` is not set.
    """
    import hmac

    expected_key = os.getenv('API_KEY')
    # Compare as bytes with compare_digest so the time taken does not reveal
    # how many leading characters of the key were correct.
    key_matches = (
        expected_key is not None
        and isinstance(api_key, str)
        and hmac.compare_digest(api_key.encode("utf-8"), expected_key.encode("utf-8"))
    )
    if not key_matches:
        raise NewsClassifierAPIAuthenticationError("Authentication error: Invalid API key.")
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """Turn a request validation failure into the API's own 422 JSON payload.

    Args:
        request: The incoming request.
        exc: The validation error raised for that request.

    Returns:
        A 422 ``Response`` whose JSON body has the same keys as the other
        responses, with ``classifier_error`` listing ``<field>: <message>``
        for every validation error.
    """
    urls = request.query_params.getlist("urls")
    if not urls:
        # The endpoint receives its URLs in the JSON body, not the query
        # string, so echo them back from the body attached to the error.
        body = getattr(exc, "body", None)
        if isinstance(body, dict) and isinstance(body.get("urls"), list):
            urls = [str(url) for url in body["urls"]]
    error_messages = []
    for error in exc.errors():
        # Only the innermost location element (the field name) is reported.
        location = str(error['loc'][-1]) if error['loc'] else ""
        error_messages.append(f"{location}: {error['msg']}")
    error_message = "; ".join(error_messages)
    response_json = {'urls': urls, 'scraped_texts': '', 'scrape_errors': '', 'categories': "", 'classifier_error': f'Validation Error: {error_message}'}
    json_str = json.dumps(response_json, indent=5) # convert dict to JSON str
    return Response(content=json_str, media_type='application/json', status_code=422)
# Load the classifier artefacts once at import time; classify() reuses these
# module-level objects for every request.
calibrated_model, label_encoder = load_model()
@app.post("/classify/", tags=["Classify"], response_model=SuccessfulResponse,
         responses={
        401: {"model": AuthenticationError, "description": "Authentication Error: Returned when the entered API key is incorrect"},
        500: {"model": ClassifierError, "description": "Classifier Error: Returned when the API couldn't classify even a single article"},
        422: {"model": InputValidationError, "description": "Validation Error: Returned when the payload data doesn't satisfy the data type requirements"}
         })
async def classify(q: URLList):
    """
    Get categories of news articles by passing the list of URLs as input.
    - **urls**: List of URLs (required)
    - **key**: Authentication key (required)
    """
    # Defaults echoed back in the error payload when a failure happens before
    # the corresponding step has produced a value.
    urls = ""
    scraped_texts = ""
    scrape_errors = ""
    try:
        logging.warning("Entering classify()")
        urls = list(q.urls)
        authenticate_key(q.key)
        scraped_texts, scrape_errors = await scrape_urls(urls)
        # Also true for an empty URL list, which previously surfaced as an
        # IndexError ("list index out of range") instead of a scrape error.
        if not any(text != "" for text in scraped_texts):
            raise NewsClassifierAPIScrapingError("Scrape Error: Couldn't scrape text from any of the URLs")
        labels, probs = await predict_news_classes(urls, scraped_texts, calibrated_model, label_encoder)
        # URLs that yielded no text get a blank label; str()/float() turn
        # numpy scalars into plain values that json.dumps can serialise.
        label_prob = [{"label": "", "calibrated_prediction_proba": 0}
                      if text == "" else {"label": str(label), "calibrated_prediction_proba": float(prob)}
                      for label, prob, text in zip(labels, probs, scraped_texts)]
        status_code = 200
        # Key is 'classifier_error' (it was misspelled 'classifer_error'), in
        # line with the error payloads and the documented schema.
        response_json = {'urls': urls, 'scraped_texts': scraped_texts, 'scrape_errors': scrape_errors, 'categories': label_prob, 'classifier_error': ''}
    except Exception as e:
        # Top-level boundary: every failure is reported in the JSON payload.
        logging.warning("classify() failed: %s", e)
        status_code = 401 if isinstance(e, NewsClassifierAPIAuthenticationError) else 500
        response_json = {'urls': urls, 'scraped_texts': scraped_texts, 'scrape_errors': scrape_errors, 'categories': "", 'classifier_error': f'Error: {e}'}
    json_str = json.dumps(response_json, indent=5) # convert dict to JSON str
    return Response(content=json_str, media_type='application/json', status_code=status_code)
if __name__ == '__main__':
    # uvicorn only accepts workers > 1 when the application is given as an
    # import string; passing the app object makes it exit at startup. "api:app"
    # is the same target the Dockerfile ENTRYPOINT uses.
    uvicorn.run("api:app", host='0.0.0.0', port=7860, workers=3)

calibrated_classifier.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from sklearn.dummy import DummyClassifier
+from tqdm import tqdm
+import multiprocessing
+import numpy as np
+import tensorflow as tf
+from transformers import DistilBertTokenizerFast
class PredictProba(DummyClassifier):
    """Estimator wrapper that produces class scores from a TFLite model.

    It exposes ``classes_``, ``fit`` and ``predict_proba`` so the object can be
    used as the base estimator of a calibration classifier; the scores come
    from running DistilBERT-tokenised text through the TFLite interpreter.
    """
    def __init__(self, tflite_model_path: str, classes_: list, n_tokens: int):
        """Store the model location, the class list and the token length.

        Args:
            tflite_model_path: Path of the ``.tflite`` model file.
            classes_: Class labels; also sets the width of the output tensor.
            n_tokens: Sequence length every text is padded/truncated to.
        """
        self.classes_ = classes_ # required attribute for an estimator to be used in calibration classifier
        self.n_tokens = n_tokens
        self.tflite_model_path = tflite_model_path
    def fit(self, x, y):
        """No-op fit; returns ``self`` without using ``x`` or ``y``."""
        print('called fit')
        return self # fit method is required for an estimator to be used in calibration classifier
    @staticmethod
    def get_token_batches(attention_mask, input_ids, batch_size: int=8):
        """Split the two token arrays into aligned batches.

        Args:
            attention_mask: Sliceable sequence with one row per text.
            input_ids: Sliceable sequence aligned with ``attention_mask``.
            batch_size: Maximum number of rows per batch.

        Returns:
            ``(attention_mask_batches, input_ids_batches)``: two lists of equal
            length. At least one (possibly empty) batch is always returned.
        """
        n_texts = len(attention_mask)
        # max(1, ...) keeps the single empty batch produced for empty input.
        n_batches = max(1, int(np.ceil(n_texts / batch_size)))
        bounds = [(i * batch_size, (i + 1) * batch_size) for i in range(n_batches)]
        attention_mask_batches = [attention_mask[start:stop] for start, stop in bounds]
        input_ids_batches = [input_ids[start:stop] for start, stop in bounds]
        return attention_mask_batches, input_ids_batches
    def get_batch_inference(self, batch_size, attention_mask, input_ids):
        """Run one batch through a freshly created TFLite interpreter.

        Args:
            batch_size: Number of rows in this batch.
            attention_mask: Attention-mask rows, written to input tensor 0.
            input_ids: Token-id rows, written to input tensor 1.

        Returns:
            The contents of the interpreter's first output tensor.
        """
        interpreter = tf.lite.Interpreter(model_path=self.tflite_model_path)
        interpreter.allocate_tensors()
        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()[0]
        # Resize the tensors to this batch's row count, then re-allocate
        # before writing any data.
        interpreter.resize_tensor_input(input_details[0]['index'],[batch_size, self.n_tokens])
        interpreter.resize_tensor_input(input_details[1]['index'],[batch_size, self.n_tokens])
        # NOTE(review): this resizes the OUTPUT tensor through the input-resize
        # API; confirm it is needed and supported.
        interpreter.resize_tensor_input(output_details['index'],[batch_size, len(self.classes_)])
        interpreter.allocate_tensors()
        interpreter.set_tensor(input_details[0]["index"], attention_mask)
        interpreter.set_tensor(input_details[1]["index"], input_ids)
        interpreter.invoke()
        tflite_pred = interpreter.get_tensor(output_details["index"])
        return tflite_pred
    def inference(self, texts):
        """Tokenise ``texts`` and score them batch by batch in worker processes.

        Args:
            texts: Texts to classify.

        Returns:
            The per-batch predictions concatenated along axis 0, in input order.
        """
        model_checkpoint = "distilbert-base-uncased"
        tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)
        tokens = tokenizer(texts, max_length=self.n_tokens, padding="max_length",
                           truncation=True, return_tensors="tf")
        attention_mask, input_ids = tokens['attention_mask'], tokens['input_ids']
        attention_mask_batches, input_ids_batches = self.get_token_batches(attention_mask, input_ids)
        # The context manager terminates the pool on exit. Previously a new
        # pool was created on every call and never closed, leaking worker
        # processes with each prediction request.
        with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
            pending = [
                pool.apply_async(self.get_batch_inference, args=(len(mask_batch), mask_batch, ids_batch))
                for mask_batch, ids_batch in zip(attention_mask_batches, input_ids_batches)
            ]
            batch_predictions = [job.get(timeout=360) for job in tqdm(pending)]
        if not batch_predictions:
            return np.array([])
        return np.concatenate(batch_predictions, axis=0)
    def predict_proba(self, X, y=None):
        """Return the model scores for ``X``; ``y`` is ignored."""
        predict_prob = self.inference(X)
        return predict_prob

classification_models/calibrated_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b154508f547e10b14021fb7b004dc6c25558fbe1c6942706cfa843b6976a2ac2
+size 4293

classification_models/label_encoder.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92b26f07332ebdd93c8f8f2e8378ecba05415eb3ae6713bd9b1f4289d921c26f
+size 370

classification_models/model.tflite ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15140df1981b0d4edd6009ac340a481e344ffb663fc449ae2fec1e69ee931615
+size 67002528

config.py ADDED Viewed

	@@ -0,0 +1,2 @@


# Timeout that scraper.py passes to the webdriver's page-load and script timeouts.
SCRAPER_TIMEOUT = 20
# Location of the chromedriver binary, relative to the working directory.
CHROME_DRIVER_PATH = "./chromedriver-linux64/chromedriver"

news_classifier.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import numpy as np
+import tensorflow as tf
+import logging
def find_path(url):
    """Reduce an article URL to its short, section-like path segments.

    The ``https://`` scheme and any ``www.`` are removed, the rest is split
    on ``/``, and segments are dropped when they are empty, equal to
    ``articleshow``, contain ``.cms``/``.ece``/``.htm``, contain a space, or
    consist of more than five hyphen-separated words.

    Args:
        url: Article URL; an empty string is returned unchanged.

    Returns:
        With more than two surviving segments, everything after the first
        joined by ``/``; with one or two, the last segment; with none, ``'-'``.
    """
    if url == '':
        return ''
    cleaned = url.replace("-/-", "-").replace("https://", "").replace("www.", "").strip()
    segments = [
        segment for segment in cleaned.split("/")
        if segment != ""
        and segment != "articleshow"
        and ".cms" not in segment
        and ".ece" not in segment
        and ".htm" not in segment
        and len(segment.split('-')) <= 5
        and " " not in segment
    ]
    if len(segments) > 2:
        # The first surviving segment (the host, when present) is dropped.
        return "/".join(segments[1:])
    if segments:
        return segments[-1]
    return '-'
async def parse_prediction(tflite_pred, label_encoder):
    """Pick the highest-scoring class of every row.

    Args:
        tflite_pred: 2-D array of scores, one row per sample.
        label_encoder: Object whose ``inverse_transform`` maps class indices
            to labels.

    Returns:
        ``(labels, scores)``: the decoded label of each row's maximum and the
        value of that maximum.
    """
    winning_index = np.argmax(tflite_pred, axis=1)
    winning_label = label_encoder.inverse_transform(winning_index)
    return winning_label, np.max(tflite_pred, axis=1)
async def model_inference(text: list, calibrated_model, label_encoder):
    """Score the texts and decode the winning label of each.

    Args:
        text: List of texts to classify.
        calibrated_model: Object exposing ``predict_proba``.
        label_encoder: Encoder handed to ``parse_prediction``.

    Returns:
        The ``(labels, probabilities)`` pair from ``parse_prediction``, or a
        pair of empty arrays when there is nothing to predict.
    """
    logging.info('Entering news_classifier.model_inference()')
    logging.info(f'Samples to predict: {len(text)}')
    if not text:
        # The old guard compared the list against "", which is always true for
        # a list and left the result unbound for "". Return an empty pair.
        logging.info('Exiting news_classifier.model_inference()')
        return np.array([]), np.array([])
    tflite_pred = calibrated_model.predict_proba(text)
    tflite_pred = await parse_prediction(tflite_pred, label_encoder)
    logging.info('Exiting news_classifier.model_inference()')
    return tflite_pred
async def predict_news_classes(urls: list, texts: list, calibrated_model, label_encoder):
    """Classify articles from their URL path plus scraped text.

    Each model input is ``"<find_path(url)>. <text>"``.

    Args:
        urls: Article URLs.
        texts: Scraped texts, aligned with ``urls``.
        calibrated_model: Model handed to ``model_inference``.
        label_encoder: Encoder handed to ``model_inference``.

    Returns:
        ``(label, prob)`` as produced by ``model_inference``.
    """
    url_paths = [find_path(url) for url in urls]
    paths_texts = []
    for path, article_text in zip(url_paths, texts):
        paths_texts.append(f"{path}. {article_text}")
    label, prob = await model_inference(paths_texts, calibrated_model, label_encoder)
    return label, prob

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+transformers==4.39.3
+tensorflow==2.15.0
+unidecode
+tf-keras==2.15.0
+selenium==4.19.0
+fastapi
+pydantic
+uvicorn
+undetected-chromedriver
+scikit-learn==1.2.2
+cloudpickle
+numpy==1.24.3

scraper.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from selenium import webdriver
+from selenium.webdriver.common.by import By
+import undetected_chromedriver as uc
+import re
+import logging
+import os
+import time
+import random
+from config import SCRAPER_TIMEOUT, CHROME_DRIVER_PATH
def get_text(url, n_words=15):
    """Load a page in headless Chrome and return its sentence-like lines.

    A line of the page's body text is kept when it has at least ``n_words``
    words and matches the sentence pattern below (starts with a word
    character and ends with a character that is not a word character, ``)``
    or whitespace).

    Args:
        url: Page to scrape.
        n_words: Minimum number of words a line needs to be kept.

    Returns:
        ``(text, "")`` with the kept lines joined by newlines on success, or
        ``("", message)`` with the first line of the error message on failure.
        Fewer than three kept lines counts as a failure.
    """
    driver = None
    try:
        logging.warning(f"Initiated Scraping {url}")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        options = uc.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument(f"user-agent={user_agent}")
        options.add_argument("--blink-settings=imagesEnabled=false")
        options.add_argument("--disable-images")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--disable-dev-shm-usage")
        driver = uc.Chrome(version_main=127, options=options, driver_executable_path=CHROME_DRIVER_PATH)
        time.sleep(random.uniform(0.5, 1.5))
        driver.set_page_load_timeout(SCRAPER_TIMEOUT)
        driver.set_script_timeout(SCRAPER_TIMEOUT)
        driver.implicitly_wait(3)
        driver.get(url)
        page_text = driver.find_element(By.TAG_NAME, "body").text
        sentence_list = []
        for sent in page_text.split("\n"):
            sent = sent.strip()
            if (len(sent.split()) >= n_words) and re.search(r"^\w.+[^\w\)\s]$", sent):
                sentence_list.append(sent)
        if len(sentence_list) < 3:
            raise Exception("Found nothing to scrape.")
        logging.warning("Successfully scraped text")
        return "\n".join(sentence_list), ""
    except Exception as e:
        logging.warning(str(e))
        err_msg = str(e).split('\n')[0]
        return "", err_msg
    finally:
        # Tear the browser down exactly once, on every path. Previously the
        # "nothing to scrape" error was raised after quit(), and the except
        # branch then called close() on the dead driver; that second error
        # escaped this function and cut short the retries in scrape_text().
        if driver is not None:
            for shutdown_step in (driver.close, driver.quit):
                try:
                    shutdown_step()
                except Exception as shutdown_error:
                    logging.warning(f"Webdriver shutdown step failed: {shutdown_error}")
            logging.warning("Closed Webdriver")
def scrape_text(url, n_words=15,max_retries=2):
    """Call ``get_text`` until it returns text or the attempts run out.

    Args:
        url: Page to scrape.
        n_words: Forwarded to ``get_text``.
        max_retries: Maximum number of attempts.

    Returns:
        ``(text, error)`` from the last attempt, or ``("", message)`` with the
        first line of the message if an exception escapes.
    """
    scraped_text, scrape_error = "", ""
    attempts_left = max_retries
    try:
        while (attempts_left >= 1) and (scraped_text == ""):
            scraped_text, scrape_error = get_text(url=url, n_words=n_words)
            attempts_left -= 1
    except Exception as e:
        return "", str(e).split('\n')[0]
    return scraped_text, scrape_error