File size: 2,154 Bytes
00320ff
9f1606d
3b3dbc9
00320ff
 
 
 
320952b
9f1606d
320952b
 
00320ff
 
3b3dbc9
 
 
 
 
 
1d197a9
00320ff
9f1606d
00320ff
3b3dbc9
00320ff
320952b
00320ff
3cc406f
00320ff
320952b
00320ff
1d197a9
320952b
 
00320ff
9f1606d
320952b
00320ff
1d197a9
00320ff
320952b
00320ff
320952b
9f1606d
320952b
 
00320ff
3b3dbc9
00320ff
3b3dbc9
 
 
 
 
00320ff
 
320952b
9f1606d
3cc406f
 
 
3b3dbc9
 
3cc406f
 
 
 
1d197a9
9f1606d
1d197a9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd
import gradio as gr
import re
from transformers import pipeline
from scraper import getComments


def chunk(a):
    """Split a sliceable sequence into contiguous, nearly equal-sized pieces.

    The number of pieces is ``round(0.3 * len(a))``, clamped to at least one
    so that very short inputs (a single element) no longer trigger a
    ``ZeroDivisionError``. Piece sizes differ by at most one element, with the
    larger pieces coming first.

    Args:
        a: Any object supporting ``len()`` and slice indexing (for example a
            list, a string, or a pandas Series).

    Returns:
        An iterator over consecutive slices of ``a`` that together cover every
        element exactly once. Empty input produces an empty iterator.
    """
    size = len(a)
    # Compare the length explicitly instead of using truthiness: callers may
    # pass a pandas Series, whose bool() is ambiguous and raises.
    if size == 0:
        return iter(())

    # round(0.3 * 1) == 0, so without the clamp divmod() would divide by zero.
    n = max(1, round(0.3 * size))
    k, m = divmod(size, n)
    # The first `m` pieces get k + 1 elements, the remaining ones get k.
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))


def preprocessText(df):
    """Clean the ``text`` column of ``df`` in place and return the same frame.

    Each entry first has every ``http...`` run of non-whitespace characters
    removed, and then every line that starts with ``>`` followed by at least
    one more character (quote-style lines) blanked out. The order matters: a
    line consisting only of ``>`` plus a URL is left as a bare ``>``.

    Args:
        df: DataFrame with a string-valued ``text`` column.

    Returns:
        The same DataFrame object, with its ``text`` column overwritten.
    """
    url_pattern = re.compile(r"http\S+", flags=re.M)
    quote_pattern = re.compile(r"^>.+", flags=re.M)

    def _clean(comment):
        # URLs are stripped before quote lines, matching the two-pass order.
        without_urls = url_pattern.sub("", comment)
        return quote_pattern.sub("", without_urls)

    df["text"] = df["text"].apply(_clean)
    return df


# Loaded summarization pipelines keyed by model path, so the model is read
# from disk once per process instead of once per request.
_SUMMARIZATION_PIPELINES = {}


def _get_summarization_pipeline(model_path: str = "./model/"):
    """Return the summarization pipeline for ``model_path``, loading it at most once."""
    if model_path not in _SUMMARIZATION_PIPELINES:
        _SUMMARIZATION_PIPELINES[model_path] = pipeline('summarization', model=model_path)
    return _SUMMARIZATION_PIPELINES[model_path]


def summarizer(url: str, summary_length: str = "Short") -> str:
    """Summarize the comments of the post at ``url``.

    Comments are fetched with ``getComments``, cleaned with
    ``preprocessText``, filtered to those scoring at least 5% of the top
    score, capped at 200 rows, summarized group by group, and the group
    summaries are joined. For the ``"Short"`` length the joined text is
    summarized once more.

    Args:
        url: URL of the post whose comments should be summarized.
        summary_length: ``"Short"`` for a second summarization pass over the
            joined group summaries; any other value returns the joined group
            summaries directly.

    Returns:
        The submission title, a blank line, and the summary text. If no
        comments are returned for the post, a short notice is returned
        instead.
    """
    # pushshift.io submission comments api doesn't work so have to use praw
    comments = getComments(url=url)

    # With no rows, the score maximum is NaN and round() would raise, and
    # there is no title to report either; bail out with a readable message.
    if len(comments) == 0:
        return "No comments found to summarize."

    df = preprocessText(comments)

    smax = df.score.max()

    # Keep comments scoring at least 5% of the best one. The min() only has an
    # effect when every score is negative: the rounded threshold would then
    # sit above the maximum and filter out every comment.
    threshold = min(round(0.05 * smax), smax)

    df = df[df.score >= threshold]

    # empirically, having more than 200 comments doesn't change much but slows down the summarizer.
    df = df.head(200)

    # chunking to handle giving the model too large of an input which crashes.
    # A single comment is passed through as its own group because the group
    # count used by chunk() rounds down to zero for one element.
    chunked = list(chunk(df.text)) if len(df) > 1 else [df.text]

    nlp = _get_summarization_pipeline()

    # treating a group of comments as one block of text; str.cat() with no
    # separator joins the comments back-to-back.
    lst_summaries = [
        nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
        for grp in chunked
    ]

    stext = ' '.join(lst_summaries).replace(" .", ".")

    if summary_length == "Short":
        body = nlp(stext, max_length=500)[0]["summary_text"].replace(" .", ".")
    else:
        body = stext

    title = df.submission_title.unique()[0]
    return f"{title}\n\n{body}"


if __name__ == "__main__":
    # Script entry point: build a small Gradio Blocks UI around `summarizer`
    # and serve it. Nothing here runs when the module is imported.

    # The custom CSS caps the container width at 900px and centers it.
    with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
        # Input: URL of the post whose comments will be summarized.
        submission_url = gr.Textbox(label='Post URL')

        # Input: passed to summarizer as `summary_length`; defaults to "Short".
        length_choice = gr.Radio(label='Summary Length', value="Short", choices=["Short", "Long"])

        sub_btn = gr.Button("Summarize")

        # Output: receives the string returned by summarizer.
        summary = gr.Textbox(label='Comment Summary')

        # Clicking the button calls summarizer(submission_url, length_choice)
        # and writes the result into the summary textbox.
        sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary)

    # Serve on port 8080 with request queueing turned off.
    # NOTE(review): `enable_queue` appears to be deprecated/removed in newer
    # Gradio releases — confirm against the pinned Gradio version.
    demo.launch(server_port=8080, enable_queue=False)