File size: 2,154 Bytes
00320ff 9f1606d 3b3dbc9 00320ff 320952b 9f1606d 320952b 00320ff 3b3dbc9 1d197a9 00320ff 9f1606d 00320ff 3b3dbc9 00320ff 320952b 00320ff 3cc406f 00320ff 320952b 00320ff 1d197a9 320952b 00320ff 9f1606d 320952b 00320ff 1d197a9 00320ff 320952b 00320ff 320952b 9f1606d 320952b 00320ff 3b3dbc9 00320ff 3b3dbc9 00320ff 320952b 9f1606d 3cc406f 3b3dbc9 3cc406f 1d197a9 9f1606d 1d197a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import pandas as pd
import gradio as gr
import re
from transformers import pipeline
from scraper import getComments
def chunk(a):
    """Split *a* into roughly ``0.3 * len(a)`` contiguous, near-equal groups.

    The group count is ``round(0.3 * len(a))``, clamped to at least one so
    that a single-element input yields one group instead of dividing by
    zero. Group sizes differ by at most one element, with the larger groups
    coming first.

    Args:
        a: Any sized object that supports slicing (list, string, Series, ...).

    Returns:
        An iterator over consecutive slices of *a*; empty when *a* is empty.
    """
    size = len(a)
    if size == 0:
        return iter(())
    # round(0.3 * size) is 0 for size == 1, which would make divmod() raise.
    n = max(1, round(0.3 * size))
    # k is the base group size; the first m groups take one extra element.
    k, m = divmod(size, n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
def preprocessText(df):
    """Strip links and ``>``-prefixed lines from the ``text`` column of *df*.

    The column is rewritten in place and the same frame is returned.
    """
    # Applied in order: first URLs (``http`` up to the next whitespace), then
    # whole lines that begin with ``>`` followed by at least one character.
    noise_patterns = (r"http\S+", r"^>.+")
    for raw_pattern in noise_patterns:
        matcher = re.compile(raw_pattern, flags=re.M)
        df["text"] = df["text"].apply(lambda comment, rx=matcher: rx.sub("", comment))
    return df
def summarizer(url: str, summary_length: str = "Short") -> str:
    """Summarize the comments of the post at *url*.

    Comments are fetched with ``getComments``, cleaned with
    ``preprocessText``, filtered by score, split into groups, and each group
    is summarized on its own. For ``summary_length == "Short"`` the joined
    group summaries are summarized once more; any other value returns the
    joined group summaries as they are.

    Args:
        url: URL of the post whose comments should be summarized.
        summary_length: ``"Short"`` for a condensed second-pass summary;
            anything else (the UI offers ``"Long"``) for the joined group
            summaries.

    Returns:
        The submission title, a blank line, and the summary text; or a short
        notice when there are no comments to summarize.
    """
    # pushshift.io submission comments api doesn't work so have to use praw
    df = preprocessText(getComments(url=url))
    if len(df) == 0:
        # max() over an empty score column is NaN and round(NaN) raises.
        return "No comments found to summarize."

    # Keep comments scoring at least 5% of the top score. The clamp keeps the
    # top comment when every score is negative (the rounded threshold would
    # otherwise sit above all of them and filter everything out).
    smax = df.score.max()
    threshold = min(round(0.05 * smax), smax)
    df = df[df.score >= threshold]

    # empirically, having more than 200 comments doesn't change much but slows down the summarizer.
    df = df.head(200)
    title = df.submission_title.unique()[0]

    # chunking to handle giving the model too large of an input which crashes
    comments = df.text
    # With a single comment chunk() would compute zero groups, so summarize
    # that comment as one group directly.
    groups = list(chunk(comments)) if len(comments) > 1 else [comments]

    # NOTE(review): the model is loaded on every call; consider caching it
    # once sharing a pipeline across request threads is confirmed safe.
    nlp = pipeline('summarization', model="./model/")

    # Treat each group of comments as one block of text. Joining with a space
    # keeps the last word of one comment from fusing with the first word of
    # the next; truncation guards against inputs longer than the model accepts.
    lst_summaries = [
        nlp(grp.str.cat(sep=" "), max_length=500, truncation=True)[0]["summary_text"]
        for grp in groups
    ]
    stext = ' '.join(lst_summaries).replace(" .", ".")

    if summary_length == "Short":
        # Second pass: condense the joined group summaries into one summary.
        body = nlp(stext, max_length=500, truncation=True)[0]["summary_text"].replace(" .", ".")
    else:
        body = stext
    return title + '\n' + '\n' + body
if __name__ == "__main__":
    # Page-level CSS: cap the container width at 900px and center it.
    page_css = ".gradio-container {max-width: 900px; margin: auto;}"

    with gr.Blocks(css=page_css) as demo:
        # Components are created in this order: URL box, length selector,
        # submit button, output box.
        submission_url = gr.Textbox(label='Post URL')
        length_choice = gr.Radio(
            choices=["Short", "Long"],
            value="Short",
            label='Summary Length',
        )
        sub_btn = gr.Button("Summarize")
        summary = gr.Textbox(label='Comment Summary')

        # Clicking the button runs summarizer(url, length) and writes the
        # returned text into the output box.
        sub_btn.click(
            fn=summarizer,
            inputs=[submission_url, length_choice],
            outputs=summary,
        )

    demo.launch(server_port=8080, enable_queue=False)
|