kkastr committed on
Commit
3b3dbc9
1 Parent(s): 3cc406f
scraper.py CHANGED
@@ -1,6 +1,5 @@
1
  import praw
2
  import pandas as pd
3
- from tqdm import tqdm
4
  from api_keys import client_id, client_secret, user_agent, username
5
 
6
 
 
1
  import praw
2
  import pandas as pd
 
3
  from api_keys import client_id, client_secret, user_agent, username
4
 
5
 
thread_analyzer.py → thread_summarizer.py RENAMED
@@ -1,5 +1,6 @@
1
  import pandas as pd
2
  import gradio as gr
 
3
  from transformers import pipeline
4
  from scraper import getComments
5
 
@@ -10,11 +11,17 @@ def chunk(a):
10
  return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
11
 
12
 
13
- def main(url: str) -> str:
 
 
 
 
 
 
14
 
15
  # pushshift.io submission comments api doesn't work so have to use praw
16
 
17
- df = getComments(url=url)
18
 
19
  smax = df.score.max()
20
 
@@ -29,7 +36,7 @@ def main(url: str) -> str:
29
  # chunking to handle giving the model too large of an input which crashes
30
  chunked = list(chunk(df.text))
31
 
32
- nlp = pipeline('summarization')
33
 
34
  lst_summaries = []
35
 
@@ -38,11 +45,13 @@ def main(url: str) -> str:
38
  result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
39
  lst_summaries.append(result)
40
 
41
- stext = ' '.join(lst_summaries)
42
-
43
- # thread_summary = nlp(ntext, max_length=500)[0]["summary_text"].replace(" .", ".")
44
 
45
- return df.submission_title.unique()[0] + '\n' + '\n' + stext
 
 
 
 
46
 
47
 
48
  if __name__ == "__main__":
@@ -50,13 +59,12 @@ if __name__ == "__main__":
50
  with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
51
  submission_url = gr.Textbox(label='Post URL')
52
 
 
 
53
  sub_btn = gr.Button("Summarize")
54
 
55
  summary = gr.Textbox(label='Comment Summary')
56
 
57
- sub_btn.click(fn=main, inputs=submission_url, outputs=summary)
58
 
59
  demo.launch()
60
- # demo = gr.Interface(fn=main, inputs="text", outputs="text")
61
-
62
- # demo.launch()
 
1
  import pandas as pd
2
  import gradio as gr
3
+ import re
4
  from transformers import pipeline
5
  from scraper import getComments
6
 
 
11
  return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
12
 
13
 
14
+ def preprocessText(df):
15
+ df["text"] = df["text"].apply(lambda x: re.sub(r"http\S+", "", x, flags=re.M))
16
+ df["text"] = df["text"].apply(lambda x: re.sub(r"^>.+", "", x, flags=re.M))
17
+ return df
18
+
19
+
20
+ def main(url: str, summary_length: str = "Short") -> str:
21
 
22
  # pushshift.io submission comments api doesn't work so have to use praw
23
 
24
+ df = preprocessText(getComments(url=url))
25
 
26
  smax = df.score.max()
27
 
 
36
  # chunking to handle giving the model too large of an input which crashes
37
  chunked = list(chunk(df.text))
38
 
39
+ nlp = pipeline('summarization', model="sshleifer/distilbart-cnn-12-6")
40
 
41
  lst_summaries = []
42
 
 
45
  result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
46
  lst_summaries.append(result)
47
 
48
+ stext = ' '.join(lst_summaries).replace(" .", ".")
 
 
49
 
50
+ if summary_length == "Short":
51
+ thread_summary = nlp(stext, max_length=500)[0]["summary_text"].replace(" .", ".")
52
+ return df.submission_title.unique()[0] + '\n' + '\n' + thread_summary
53
+ else:
54
+ return df.submission_title.unique()[0] + '\n' + '\n' + stext
55
 
56
 
57
  if __name__ == "__main__":
 
59
  with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
60
  submission_url = gr.Textbox(label='Post URL')
61
 
62
+ length_choice = gr.Radio(label='Summary Length', value="Short", choices=["Short", "Long"])
63
+
64
  sub_btn = gr.Button("Summarize")
65
 
66
  summary = gr.Textbox(label='Comment Summary')
67
 
68
+ sub_btn.click(fn=main, inputs=[submission_url, length_choice], outputs=summary)
69
 
70
  demo.launch()