kkastr
committed on
Commit
•
1d197a9
1
Parent(s):
3b3dbc9
Rename files. Added preliminary setup for deployment on AWS (pending permission fix)
Browse files
- .gitignore +2 -0
- Dockerfile +13 -0
- thread_summarizer.py → app.py +5 -5
- cdk.json +3 -0
- cdk.py +34 -0
- download_model.py +8 -0
- requirements.txt +9 -0
- scraper.py +13 -3
.gitignore
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
data/
|
2 |
model/
|
3 |
archive/
|
|
|
|
|
4 |
api_keys.py
|
5 |
*.csv
|
6 |
|
|
|
1 |
data/
|
2 |
model/
|
3 |
archive/
|
4 |
+
ckd.out
|
5 |
+
|
6 |
api_keys.py
|
7 |
*.csv
|
8 |
|
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Dockerfile
|
2 |
+
FROM public.ecr.aws/docker/library/python:3.9.16-slim-buster
|
3 |
+
|
4 |
+
COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.6.0 /lambda-adapter /opt/extensions/lambda-adapter
|
5 |
+
WORKDIR /var/task
|
6 |
+
|
7 |
+
COPY requirements.txt ./requirements.txt
|
8 |
+
RUN python -m pip install -r requirements.txt
|
9 |
+
|
10 |
+
COPY app.py ./
|
11 |
+
COPY scraper.py ./
|
12 |
+
COPY model/ ./model/
|
13 |
+
CMD ["python3", "app.py"]
|
thread_summarizer.py → app.py
RENAMED
@@ -17,7 +17,7 @@ def preprocessText(df):
|
|
17 |
return df
|
18 |
|
19 |
|
20 |
-
def
|
21 |
|
22 |
# pushshift.io submission comments api doesn't work so have to use praw
|
23 |
|
@@ -29,14 +29,14 @@ def main(url: str, summary_length: str = "Short") -> str:
|
|
29 |
|
30 |
df = df[df.score >= threshold]
|
31 |
|
32 |
-
# empirically, having more than 200 comments doesn't change much
|
33 |
if len(df.text) >= 200:
|
34 |
df = df[:200]
|
35 |
|
36 |
# chunking to handle giving the model too large of an input which crashes
|
37 |
chunked = list(chunk(df.text))
|
38 |
|
39 |
-
nlp = pipeline('summarization', model="
|
40 |
|
41 |
lst_summaries = []
|
42 |
|
@@ -65,6 +65,6 @@ if __name__ == "__main__":
|
|
65 |
|
66 |
summary = gr.Textbox(label='Comment Summary')
|
67 |
|
68 |
-
sub_btn.click(fn=
|
69 |
|
70 |
-
demo.launch()
|
|
|
17 |
return df
|
18 |
|
19 |
|
20 |
+
def summarizer(url: str, summary_length: str = "Short") -> str:
|
21 |
|
22 |
# pushshift.io submission comments api doesn't work so have to use praw
|
23 |
|
|
|
29 |
|
30 |
df = df[df.score >= threshold]
|
31 |
|
32 |
+
# empirically, having more than 200 comments doesn't change much but slows down the summarizer.
|
33 |
if len(df.text) >= 200:
|
34 |
df = df[:200]
|
35 |
|
36 |
# chunking to handle giving the model too large of an input which crashes
|
37 |
chunked = list(chunk(df.text))
|
38 |
|
39 |
+
nlp = pipeline('summarization', model="./model/")
|
40 |
|
41 |
lst_summaries = []
|
42 |
|
|
|
65 |
|
66 |
summary = gr.Textbox(label='Comment Summary')
|
67 |
|
68 |
+
sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary)
|
69 |
|
70 |
+
demo.launch(server_port=8080, enable_queue=False)
|
cdk.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"app": "python3 cdk.py"
|
3 |
+
}
|
cdk.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
from constructs import Construct
|
4 |
+
from aws_cdk import App, Stack, Environment, Duration, CfnOutput
|
5 |
+
from aws_cdk.aws_lambda import DockerImageFunction, DockerImageCode
|
6 |
+
from aws_cdk.aws_lambda import Architecture, FunctionUrlAuthType
|
7 |
+
|
8 |
+
my_environment = Environment(
|
9 |
+
account=os.environ["CDK_DEFAULT_ACCOUNT"],
|
10 |
+
region=os.environ["CDK_DEFAULT_REGION"])
|
11 |
+
|
12 |
+
|
13 |
+
class GradioLambda(Stack):
|
14 |
+
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
|
15 |
+
super().__init__(scope, construct_id, **kwargs)
|
16 |
+
|
17 |
+
# create function
|
18 |
+
lambda_fn = DockerImageFunction(
|
19 |
+
self,
|
20 |
+
"GradioApp",
|
21 |
+
code=DockerImageCode.from_image_asset(str(Path.cwd()), file="Dockerfile"),
|
22 |
+
architecture=Architecture.X86_64,
|
23 |
+
memory_size=3008,
|
24 |
+
timeout=Duration.minutes(2),
|
25 |
+
)
|
26 |
+
# add HTTPS url
|
27 |
+
fn_url = lambda_fn.add_function_url(auth_type=FunctionUrlAuthType.NONE)
|
28 |
+
CfnOutput(self, "functionUrl", value=fn_url.url)
|
29 |
+
|
30 |
+
|
31 |
+
app = App()
|
32 |
+
rust_lambda = GradioLambda(app, "GradioLambda", env=my_environment)
|
33 |
+
|
34 |
+
app.synth()
|
download_model.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
2 |
+
|
3 |
+
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
|
4 |
+
|
5 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
|
6 |
+
|
7 |
+
tokenizer.save_pretrained("./model")
|
8 |
+
model.save_pretrained("./model")
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
boto3==1.21.32
|
2 |
+
constructs==10.1.263
|
3 |
+
gradio==3.19.1
|
4 |
+
pandas==1.4.2
|
5 |
+
praw==7.6.0
|
6 |
+
transformers==4.26.1
|
7 |
+
|
8 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
9 |
+
torch==1.13.0+cpu
|
scraper.py
CHANGED
@@ -1,10 +1,16 @@
|
|
1 |
import praw
|
|
|
2 |
import pandas as pd
|
3 |
-
|
4 |
|
5 |
|
6 |
def getComments(url):
|
7 |
|
|
|
|
|
|
|
|
|
|
|
8 |
cols = [
|
9 |
"text",
|
10 |
"score",
|
@@ -16,10 +22,14 @@ def getComments(url):
|
|
16 |
]
|
17 |
|
18 |
reddit = praw.Reddit(
|
19 |
-
client_id=
|
20 |
)
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
|
23 |
submission.comments.replace_more(limit=0)
|
24 |
rows = []
|
25 |
|
|
|
1 |
import praw
|
2 |
+
import praw.exceptions as redditexception
|
3 |
import pandas as pd
|
4 |
+
import boto3
|
5 |
|
6 |
|
7 |
def getComments(url):
|
8 |
|
9 |
+
ssm = boto3.client('ssm')
|
10 |
+
cid = ssm.get_parameter(Name='client_id', WithDecryption=True)['Parameter']['Value']
|
11 |
+
csecret = ssm.get_parameter(Name='client_secret', WithDecryption=True)['Parameter']['Value']
|
12 |
+
user_agent = ssm.get_parameter(Name='user_agent', WithDecryption=True)['Parameter']['Value']
|
13 |
+
|
14 |
cols = [
|
15 |
"text",
|
16 |
"score",
|
|
|
22 |
]
|
23 |
|
24 |
reddit = praw.Reddit(
|
25 |
+
client_id=cid , client_secret=csecret, user_agent=user_agent
|
26 |
)
|
27 |
|
28 |
+
try:
|
29 |
+
submission = reddit.submission(url=url)
|
30 |
+
except redditexception.InvalidURL:
|
31 |
+
print("The URL is invalid. Make sure that you have included the submission id")
|
32 |
+
|
33 |
submission.comments.replace_more(limit=0)
|
34 |
rows = []
|
35 |
|