kkastr
committed on
Commit
•
3b3dbc9
1
Parent(s):
3cc406f
rename
Browse files- scraper.py +0 -1
- thread_analyzer.py → thread_summarizer.py +19 -11
scraper.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import praw
|
2 |
import pandas as pd
|
3 |
-
from tqdm import tqdm
|
4 |
from api_keys import client_id, client_secret, user_agent, username
|
5 |
|
6 |
|
|
|
1 |
import praw
|
2 |
import pandas as pd
|
|
|
3 |
from api_keys import client_id, client_secret, user_agent, username
|
4 |
|
5 |
|
thread_analyzer.py → thread_summarizer.py
RENAMED
@@ -1,5 +1,6 @@
|
|
1 |
import pandas as pd
|
2 |
import gradio as gr
|
|
|
3 |
from transformers import pipeline
|
4 |
from scraper import getComments
|
5 |
|
@@ -10,11 +11,17 @@ def chunk(a):
|
|
10 |
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
|
11 |
|
12 |
|
13 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# pushshift.io submission comments api doesn't work so have to use praw
|
16 |
|
17 |
-
df = getComments(url=url)
|
18 |
|
19 |
smax = df.score.max()
|
20 |
|
@@ -29,7 +36,7 @@ def main(url: str) -> str:
|
|
29 |
# chunking to handle giving the model too large of an input which crashes
|
30 |
chunked = list(chunk(df.text))
|
31 |
|
32 |
-
nlp = pipeline('summarization')
|
33 |
|
34 |
lst_summaries = []
|
35 |
|
@@ -38,11 +45,13 @@ def main(url: str) -> str:
|
|
38 |
result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
|
39 |
lst_summaries.append(result)
|
40 |
|
41 |
-
stext = ' '.join(lst_summaries)
|
42 |
-
|
43 |
-
# thread_summary = nlp(ntext, max_length=500)[0]["summary_text"].replace(" .", ".")
|
44 |
|
45 |
-
|
|
|
|
|
|
|
|
|
46 |
|
47 |
|
48 |
if __name__ == "__main__":
|
@@ -50,13 +59,12 @@ if __name__ == "__main__":
|
|
50 |
with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
|
51 |
submission_url = gr.Textbox(label='Post URL')
|
52 |
|
|
|
|
|
53 |
sub_btn = gr.Button("Summarize")
|
54 |
|
55 |
summary = gr.Textbox(label='Comment Summary')
|
56 |
|
57 |
-
sub_btn.click(fn=main, inputs=submission_url, outputs=summary)
|
58 |
|
59 |
demo.launch()
|
60 |
-
# demo = gr.Interface(fn=main, inputs="text", outputs="text")
|
61 |
-
|
62 |
-
# demo.launch()
|
|
|
1 |
import pandas as pd
|
2 |
import gradio as gr
|
3 |
+
import re
|
4 |
from transformers import pipeline
|
5 |
from scraper import getComments
|
6 |
|
|
|
11 |
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
|
12 |
|
13 |
|
14 |
+
def preprocessText(df):
    r"""Strip URLs and quoted lines from the ``text`` column of *df*.

    Two substitutions are applied, in order, to every string in
    ``df["text"]``:

    1. every match of ``http\S+`` (``http`` followed by the run of
       non-whitespace characters after it) is removed;
    2. every line that starts with ``>`` and has at least one more
       character is removed; the line break itself is kept.

    Entries that are not strings (for example ``None`` or ``NaN``) are left
    untouched rather than being passed to ``re.sub``, which would raise
    ``TypeError``.

    Args:
        df: DataFrame that has a ``text`` column.

    Returns:
        The same DataFrame object; its ``text`` column is overwritten with
        the cleaned values.
    """
    def _clean(value):
        # Skip missing / non-text cells instead of crashing in re.sub.
        if not isinstance(value, str):
            return value
        # The URL pattern has no anchors, so no MULTILINE flag is needed.
        without_urls = re.sub(r"http\S+", "", value)
        # MULTILINE lets ``^`` match at the start of every line, so each
        # quoted line is dropped, not only one at the start of the text.
        return re.sub(r"^>.+", "", without_urls, flags=re.MULTILINE)

    df["text"] = df["text"].apply(_clean)
    return df
|
18 |
+
|
19 |
+
|
20 |
+
def main(url: str, summary_length: str = "Short") -> str:
|
21 |
|
22 |
# pushshift.io submission comments api doesn't work so have to use praw
|
23 |
|
24 |
+
df = preprocessText(getComments(url=url))
|
25 |
|
26 |
smax = df.score.max()
|
27 |
|
|
|
36 |
# chunking to handle giving the model too large of an input which crashes
|
37 |
chunked = list(chunk(df.text))
|
38 |
|
39 |
+
nlp = pipeline('summarization', model="sshleifer/distilbart-cnn-12-6")
|
40 |
|
41 |
lst_summaries = []
|
42 |
|
|
|
45 |
result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
|
46 |
lst_summaries.append(result)
|
47 |
|
48 |
+
stext = ' '.join(lst_summaries).replace(" .", ".")
|
|
|
|
|
49 |
|
50 |
+
if summary_length == "Short":
|
51 |
+
thread_summary = nlp(stext, max_length=500)[0]["summary_text"].replace(" .", ".")
|
52 |
+
return df.submission_title.unique()[0] + '\n' + '\n' + thread_summary
|
53 |
+
else:
|
54 |
+
return df.submission_title.unique()[0] + '\n' + '\n' + stext
|
55 |
|
56 |
|
57 |
if __name__ == "__main__":
|
|
|
59 |
with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
|
60 |
submission_url = gr.Textbox(label='Post URL')
|
61 |
|
62 |
+
length_choice = gr.Radio(label='Summary Length', value="Short", choices=["Short", "Long"])
|
63 |
+
|
64 |
sub_btn = gr.Button("Summarize")
|
65 |
|
66 |
summary = gr.Textbox(label='Comment Summary')
|
67 |
|
68 |
+
sub_btn.click(fn=main, inputs=[submission_url, length_choice], outputs=summary)
|
69 |
|
70 |
demo.launch()
|
|
|
|
|
|