Archan committed on
Commit
d8f23da
·
1 Parent(s): 46c7287

added main file and requirements.txt

Browse files
Files changed (2) hide show
  1. requirements.txt +85 -0
  2. strm.py +141 -0
requirements.txt ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.1
2
+ aiosignal==1.3.1
3
+ altair==5.2.0
4
+ annotated-types==0.6.0
5
+ anyio==4.1.0
6
+ arxiv==2.0.0
7
+ attrs==23.1.0
8
+ blinker==1.7.0
9
+ cachetools==5.3.2
10
+ certifi==2023.11.17
11
+ charset-normalizer==3.3.2
12
+ click==8.1.7
13
+ colorama==0.4.6
14
+ dataclasses-json==0.6.3
15
+ feedparser==6.0.10
16
+ filelock==3.13.1
17
+ frozenlist==1.4.0
18
+ fsspec==2023.12.1
19
+ gitdb==4.0.11
20
+ GitPython==3.1.40
21
+ greenlet==3.0.2
22
+ huggingface-hub==0.19.4
23
+ idna==3.6
24
+ importlib-metadata==6.11.0
25
+ Jinja2==3.1.2
26
+ jsonpatch==1.33
27
+ jsonpointer==2.4
28
+ jsonschema==4.20.0
29
+ jsonschema-specifications==2023.11.2
30
+ langchain==0.0.348
31
+ langchain-core==0.0.12
32
+ langsmith==0.0.69
33
+ markdown-it-py==3.0.0
34
+ MarkupSafe==2.1.3
35
+ marshmallow==3.20.1
36
+ mdurl==0.1.2
37
+ mpmath==1.3.0
38
+ multidict==6.0.4
39
+ mypy-extensions==1.0.0
40
+ networkx==3.2.1
41
+ numpy==1.26.2
42
+ packaging==23.2
43
+ pandas==2.1.4
44
+ Pillow==10.1.0
45
+ protobuf==4.25.1
46
+ pyarrow==14.0.1
47
+ pydantic==2.5.2
48
+ pydantic_core==2.14.5
49
+ pydeck==0.8.1b0
50
+ Pygments==2.17.2
51
+ PyMuPDF==1.23.7
52
+ PyMuPDFb==1.23.7
53
+ python-dateutil==2.8.2
54
+ pytz==2023.3.post1
55
+ PyYAML==6.0.1
56
+ referencing==0.32.0
57
+ regex==2023.10.3
58
+ requests==2.31.0
59
+ rich==13.7.0
60
+ rpds-py==0.13.2
61
+ safetensors==0.4.1
62
+ sgmllib3k==1.0.0
63
+ six==1.16.0
64
+ smmap==5.0.1
65
+ sniffio==1.3.0
66
+ SQLAlchemy==2.0.23
67
+ streamlit==1.29.0
68
+ sympy==1.12
69
+ tenacity==8.2.3
70
+ tokenizers==0.15.0
71
+ toml==0.10.2
72
+ toolz==0.12.0
73
+ torch==2.1.1
74
+ tornado==6.4
75
+ tqdm==4.66.1
76
+ transformers==4.36.0
77
+ typing-inspect==0.9.0
78
+ typing_extensions==4.9.0
79
+ tzdata==2023.3
80
+ tzlocal==5.2
81
+ urllib3==2.1.0
82
+ validators==0.22.0
83
+ watchdog==3.0.0
84
+ yarl==1.9.4
85
+ zipp==3.17.0
strm.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.document_loaders import ArxivLoader
3
+ from transformers import pipeline
4
+
5
+
6
+
7
def strip(content):
    """Return *content* as a single-line string.

    The argument is converted with ``str()`` first, then every newline
    character is replaced by one space.
    """
    return str(content).replace("\n", " ")
15
+
16
def clip(content):
    """Trim *content* to the span between "Introduction" and the references.

    The slice starts at the first occurrence of "Introduction" and stops at
    the last occurrence of "Reference". A warning is printed whenever one of
    the two markers cannot be used.

    Args:
        content: Full text of the paper as a single string.

    Returns:
        The clipped text, or *content* unchanged when there is no
        "Introduction" marker.
    """
    loc_intro = content.find("Introduction")
    if loc_intro == -1:
        print("Warning: Paper Doesn't Have an Introduction Title, these may lead to overlap of summarization")
        return content

    loc_refer = content.rfind("Reference")
    # Only a marker located *after* the introduction can close the slice.
    # A "Reference" that appears solely before it would otherwise produce an
    # empty slice and silently discard the whole paper body.
    if loc_refer > loc_intro:
        return content[loc_intro:loc_refer]

    print("Warning: Paper Doesn't have a References Title, may lead to overlap of references in summary")
    return content[loc_intro:]
29
+
30
+
31
def chunk(content, sentences_per_chunk=10):
    """Split paper text into chunks of consecutive sentences.

    The text is first clipped with ``clip`` and then split on ". ". Sentences
    are regrouped so that every chunk holds at most *sentences_per_chunk* of
    them; each sentence is terminated by ". " inside the chunk.

    Args:
        content: Paper text as a single string.
        sentences_per_chunk: Maximum number of sentences per chunk.

    Returns:
        A list of chunk strings; empty when the clipped text has no sentences.

    Raises:
        ValueError: If *sentences_per_chunk* is smaller than 1.
    """
    if sentences_per_chunk < 1:
        raise ValueError("sentences_per_chunk must be at least 1")

    print("-----Clipping content between Intro and References--------")

    content = clip(content)

    # Drop blank fragments so empty input does not produce a ". " chunk.
    sentences = [piece for piece in content.split(". ") if piece.strip()]

    sent = []
    for start in range(0, len(sentences), sentences_per_chunk):
        group = sentences[start:start + sentences_per_chunk]
        # The final sentence of the text may still carry its own period;
        # avoid turning it into "..".
        closed = [piece if piece.endswith(".") else piece + "." for piece in group]
        sent.append(" ".join(closed) + " ")

    return sent
52
+
53
+
54
def summarize(sent):
    """Summarize every text chunk and join the summaries line by line.

    Args:
        sent: Iterable of text chunks (strings) to summarize.

    Returns:
        One string holding the summary of each non-blank chunk, each followed
        by a newline. Returns "" when there is nothing to summarize.
    """
    chunks = [text for text in sent if text and text.strip()]
    if not chunks:
        # Nothing to do: skip building the summarization pipeline entirely.
        return ""

    model_str = "Falconsai/text_summarization"
    tokenizer_str = "Falconsai/text_summarization"

    summarizer = pipeline("summarization", model=model_str, tokenizer=tokenizer_str)

    lines = []
    for text in chunks:
        result = summarizer(text, max_length=256, min_length=64, do_sample=False)
        lines.append(result[0]['summary_text'] + "\n")

    return "".join(lines)
67
+
68
def doc_load(search_query="default", n_docs=1):
    """Search arXiv and return the matching documents with a title index.

    Args:
        search_query: Paper ID or free-text query. The sentinel "default"
            returns placeholder values without performing a search.
        n_docs: Maximum number of documents to load.

    Returns:
        A ``(titles, docs, n_pairs)`` tuple: the list of titles, the loaded
        documents, and a dict mapping each title to its index in *docs*.
        When the search fails, ``([], [], {})`` is returned so callers can
        always unpack three values.
    """
    if search_query == "default":
        return [" ", " "], [" ", " "], [" ", " "]
    try:
        print("-------searching Paper----------")
        docs = ArxivLoader(query=search_query, load_max_docs=n_docs).load()
        titles = []
        n_pairs = {}
        # Iterate over what was actually returned: a search can yield fewer
        # than n_docs results, and indexing by range(n_docs) would then fail.
        for i, doc in enumerate(docs[:n_docs]):
            title = doc.metadata['Title']
            titles.append(title)
            n_pairs[title] = i
        return titles, docs, n_pairs
    except Exception as e:
        print("--------ERROR while Trying to Search Paper-------------")
        print(e)
        return [], [], {}
84
+
85
+
86
def run(choice, docs, n_pairs):
    """Build the summary report for the paper selected by title.

    Args:
        choice: Title of the selected paper; used as a key into *n_pairs*.
        docs: Sequence of loaded documents exposing ``metadata`` and
            ``page_content``.
        n_pairs: Mapping from title to the document's index in *docs*.

    Returns:
        A string with the publication date, title, authors and the generated
        summary.
    """
    index = n_pairs[choice]
    st.text("Fetching Metadata")
    print("-----fetching metadata-------------")
    selected = docs[index]
    metadata = selected.metadata
    body = selected.page_content

    print("----stripping new lines----------")
    flattened = strip(body)
    print("-----------chunking content--------------")
    pieces = chunk(flattened)
    st.text("Chunking Text....")
    st.text("🤔 Shortening text...")
    print("----summarizing content---------")
    summarized = summarize(pieces)

    report_parts = [
        "Date: ",
        str(metadata['Published']),
        "\n",
        "\n Title: ",
        metadata['Title'],
        "\n",
        "\n Authors: ",
        metadata['Authors'],
        "\n",
        "\n Summary: \n",
        summarized,
    ]
    return "".join(report_parts)
105
+
106
st.title("ArXiV Summarizer")

# Streamlit re-executes this script from the top on every interaction, and a
# form's submit button is only True during the rerun its own click triggered.
# Search results are therefore kept in st.session_state so they are still
# available when the second form ("paper_form") is submitted.
if "titles" not in st.session_state:
    st.session_state["titles"] = []
    st.session_state["docs"] = []
    st.session_state["n_pairs"] = {}

with st.form(key="search_form"):
    col1, col2 = st.columns(2)
    with col1:
        search_query = st.text_input("Search Using Paper ID or Name*")
    with col2:
        n_docs = st.selectbox(label="Number of Documents to Load", options=(1, 2, 3, 4, 5, 6, 7, 8, 9 ,10))
    submit = st.form_submit_button(label="Search")
    if submit:
        st.write("Fetching Papers 🤔 ")
        try:
            result = doc_load(search_query=search_query, n_docs=n_docs)
        except Exception as e:
            print(e)
            result = None
        # Treat a missing result (nothing to unpack) like an empty search.
        titles, docs, n_pairs = result if result is not None else ([], [], {})
        st.session_state["titles"] = titles
        st.session_state["docs"] = docs
        st.session_state["n_pairs"] = n_pairs
        if titles:
            st.write("Papers Fetched 🤩 ")
        else:
            st.write("Error while Fetching Papers 😥 Please Check ID or Name")

titles = st.session_state["titles"]
label = "Papers for " + search_query
with st.form(key="paper_form"):
    paper_name = st.selectbox(label=label, options=titles)
    submit_paper = st.form_submit_button(label="Fetch Paper & Summarize")
    print(submit_paper)
    if submit_paper:
        if paper_name is None:
            # No search has produced a selectable paper yet.
            st.write("Please search for a paper first")
        else:
            st.text("Reading Document.... 📄 ")
            output = run(paper_name, st.session_state["docs"], st.session_state["n_pairs"])
            st.text_area(label = "Summary", value=output, height = 650)

st.text('''* - Please Use Paper ID (Example : 2301.10172) as it will give accurate results.
Free text search can give errors sometimes''')
st.text("While using Paper ID no need to change Number of Documents to load")