|
import pandas as pd |
|
import gradio as gr |
|
import re |
|
from transformers import pipeline |
|
from scraper import getComments |
|
|
|
|
|
def chunk(a): |
|
n = round(0.3 * len(a)) |
|
k, m = divmod(len(a), n) |
|
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)) |
|
|
|
|
|
def preprocessText(df): |
|
df["text"] = df["text"].apply(lambda x: re.sub(r"http\S+", "", x, flags=re.M)) |
|
df["text"] = df["text"].apply(lambda x: re.sub(r"^>.+", "", x, flags=re.M)) |
|
return df |
|
|
|
|
|
def summarizer(url: str, summary_length: str = "Short") -> str: |
|
|
|
|
|
|
|
df = preprocessText(getComments(url=url)) |
|
|
|
smax = df.score.max() |
|
|
|
threshold = round(0.05 * smax) |
|
|
|
df = df[df.score >= threshold] |
|
|
|
|
|
if len(df.text) >= 200: |
|
df = df[:200] |
|
|
|
|
|
chunked = list(chunk(df.text)) |
|
|
|
nlp = pipeline('summarization', model="./model/") |
|
|
|
lst_summaries = [] |
|
|
|
for grp in chunked: |
|
|
|
result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"] |
|
lst_summaries.append(result) |
|
|
|
stext = ' '.join(lst_summaries).replace(" .", ".") |
|
|
|
if summary_length == "Short": |
|
thread_summary = nlp(stext, max_length=500)[0]["summary_text"].replace(" .", ".") |
|
return df.submission_title.unique()[0] + '\n' + '\n' + thread_summary |
|
else: |
|
return df.submission_title.unique()[0] + '\n' + '\n' + stext |
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo: |
|
submission_url = gr.Textbox(label='Post URL') |
|
|
|
length_choice = gr.Radio(label='Summary Length', value="Short", choices=["Short", "Long"]) |
|
|
|
sub_btn = gr.Button("Summarize") |
|
|
|
summary = gr.Textbox(label='Comment Summary') |
|
|
|
sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary) |
|
|
|
demo.launch(server_port=8080, enable_queue=False) |
|
|