prashant
committed on
Commit
·
f47e7d4
1
Parent(s):
8f1008c
reverting changes
Browse files- udfPreprocess/cleaning.py +4 -16
- udfPreprocess/docPreprocessing.py +6 -6
- udfPreprocess/paramconfig.cfg +0 -12
- udfPreprocess/sdg.py +0 -57
- udfPreprocess/search.py +0 -145
- udfPreprocess/uploadAndExample.py +0 -48
udfPreprocess/cleaning.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
import logging
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import string
|
@@ -11,7 +10,7 @@ import streamlit as st
|
|
11 |
from haystack.nodes import PreProcessor
|
12 |
|
13 |
'''basic cleaning - suitable for transformer models'''
|
14 |
-
def basic(s
|
15 |
"""
|
16 |
:param s: string to be processed
|
17 |
:return: processed string: see comments in the source code for more info
|
@@ -24,15 +23,6 @@ def basic(s,SDG = False):
|
|
24 |
# Remove URLs
|
25 |
s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
|
26 |
s = re.sub(r"http\S+", " ", s)
|
27 |
-
if SDG == True:
|
28 |
-
s = s.lower()
|
29 |
-
translator = str.maketrans(' ', ' ', string.punctuation)
|
30 |
-
s = s.translate(translator)
|
31 |
-
s = re.sub('\n', ' ', s)
|
32 |
-
s = re.sub("\'", " ", s)
|
33 |
-
s = re.sub(r'\d+', ' ', s)
|
34 |
-
s = re.sub(r'\W+', ' ', s)
|
35 |
-
|
36 |
# Remove new line characters
|
37 |
#s = re.sub('\n', ' ', s)
|
38 |
|
@@ -69,10 +59,9 @@ def preprocessingForSDG(document):
|
|
69 |
for i in document:
|
70 |
docs_processed = preprocessor.process([i])
|
71 |
for item in docs_processed:
|
72 |
-
item.content = basic(item.content
|
73 |
|
74 |
-
|
75 |
-
logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
|
76 |
|
77 |
# create dataframe of text and list of all text
|
78 |
df = pd.DataFrame(docs_processed)
|
@@ -104,8 +93,7 @@ def preprocessing(document):
|
|
104 |
for item in docs_processed:
|
105 |
item.content = basic(item.content)
|
106 |
|
107 |
-
|
108 |
-
logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
|
109 |
|
110 |
# create dataframe of text and list of all text
|
111 |
df = pd.DataFrame(docs_processed)
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
3 |
import string
|
|
|
10 |
from haystack.nodes import PreProcessor
|
11 |
|
12 |
'''basic cleaning - suitable for transformer models'''
|
13 |
+
def basic(s):
|
14 |
"""
|
15 |
:param s: string to be processed
|
16 |
:return: processed string: see comments in the source code for more info
|
|
|
23 |
# Remove URLs
|
24 |
s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
|
25 |
s = re.sub(r"http\S+", " ", s)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
# Remove new line characters
|
27 |
#s = re.sub('\n', ' ', s)
|
28 |
|
|
|
59 |
for i in document:
|
60 |
docs_processed = preprocessor.process([i])
|
61 |
for item in docs_processed:
|
62 |
+
item.content = basic(item.content)
|
63 |
|
64 |
+
st.write("your document has been splitted to", len(docs_processed), "paragraphs")
|
|
|
65 |
|
66 |
# create dataframe of text and list of all text
|
67 |
df = pd.DataFrame(docs_processed)
|
|
|
93 |
for item in docs_processed:
|
94 |
item.content = basic(item.content)
|
95 |
|
96 |
+
st.write("your document has been splitted to", len(docs_processed), "paragraphs")
|
|
|
97 |
|
98 |
# create dataframe of text and list of all text
|
99 |
df = pd.DataFrame(docs_processed)
|
udfPreprocess/docPreprocessing.py
CHANGED
@@ -65,11 +65,11 @@ def load_document(
|
|
65 |
This can happen whith certain pdf types.'''
|
66 |
for i in documents:
|
67 |
if i.content == "":
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
|
75 |
return documents
|
|
|
65 |
This can happen whith certain pdf types.'''
|
66 |
for i in documents:
|
67 |
if i.content == "":
|
68 |
+
st.write("using pdfplumber")
|
69 |
+
text = []
|
70 |
+
with pdfplumber.open(file_path) as pdf:
|
71 |
+
for page in pdf.pages:
|
72 |
+
text.append(page.extract_text())
|
73 |
+
i.content = ' '.join([page for page in text])
|
74 |
|
75 |
return documents
|
udfPreprocess/paramconfig.cfg
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
[lexical_search]
|
2 |
-
TOP_K = 10
|
3 |
-
THRESHOLD = 0.1
|
4 |
-
|
5 |
-
[semantic_search]
|
6 |
-
TOP_K = 10
|
7 |
-
MAX_SEQ_LENGTH = 64
|
8 |
-
MODEL_NAME = msmarco-distilbert-cos-v5
|
9 |
-
THRESHOLD = 0.1
|
10 |
-
|
11 |
-
[sdg]
|
12 |
-
THRESHOLD = 0.85
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
udfPreprocess/sdg.py
DELETED
@@ -1,57 +0,0 @@
|
|
1 |
-
import glob, os, sys;
|
2 |
-
sys.path.append('../udfPreprocess')
|
3 |
-
|
4 |
-
#import helper
|
5 |
-
import udfPreprocess.docPreprocessing as pre
|
6 |
-
import udfPreprocess.cleaning as clean
|
7 |
-
|
8 |
-
#import needed libraries
|
9 |
-
import seaborn as sns
|
10 |
-
from pandas import DataFrame
|
11 |
-
from keybert import KeyBERT
|
12 |
-
from transformers import pipeline
|
13 |
-
import matplotlib.pyplot as plt
|
14 |
-
import numpy as np
|
15 |
-
import streamlit as st
|
16 |
-
import pandas as pd
|
17 |
-
import docx
|
18 |
-
from docx.shared import Inches
|
19 |
-
from docx.shared import Pt
|
20 |
-
from docx.enum.style import WD_STYLE_TYPE
|
21 |
-
|
22 |
-
import tempfile
|
23 |
-
import sqlite3
|
24 |
-
import logging
|
25 |
-
logger = logging.getLogger(__name__)
|
26 |
-
import configparser
|
27 |
-
|
28 |
-
@st.cache(allow_output_mutation=True)
|
29 |
-
def load_sdgClassifier():
|
30 |
-
classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
|
31 |
-
logging.info("Loading classifier")
|
32 |
-
return classifier
|
33 |
-
|
34 |
-
def sdg_classification(par_list):
|
35 |
-
logging.info("running SDG classifiication")
|
36 |
-
config = configparser.ConfigParser()
|
37 |
-
config.read_file(open('udfPreprocess/paramconfig.cfg'))
|
38 |
-
threshold = float(config.get('sdg','THRESHOLD'))
|
39 |
-
|
40 |
-
|
41 |
-
classifier = load_sdgClassifier()
|
42 |
-
labels = classifier(par_list)
|
43 |
-
|
44 |
-
labels_= [(l['label'],l['score']) for l in labels]
|
45 |
-
# df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
|
46 |
-
df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
|
47 |
-
|
48 |
-
df2['text'] = par_list
|
49 |
-
df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
|
50 |
-
df2.index += 1
|
51 |
-
df2 =df2[df2['Relevancy']>threshold]
|
52 |
-
x = df2['SDG'].value_counts()
|
53 |
-
df3 = df2.copy()
|
54 |
-
df3= df3.drop(['Relevancy'], axis = 1)
|
55 |
-
|
56 |
-
|
57 |
-
return df3, x
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
udfPreprocess/search.py
DELETED
@@ -1,145 +0,0 @@
|
|
1 |
-
import glob, os, sys; sys.path.append('../udfPreprocess')
|
2 |
-
|
3 |
-
#import helper
|
4 |
-
import udfPreprocess.docPreprocessing as pre
|
5 |
-
import udfPreprocess.cleaning as clean
|
6 |
-
|
7 |
-
#import needed libraries
|
8 |
-
import seaborn as sns
|
9 |
-
from pandas import DataFrame
|
10 |
-
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
11 |
-
# from keybert import KeyBERT
|
12 |
-
from transformers import pipeline
|
13 |
-
import matplotlib.pyplot as plt
|
14 |
-
import numpy as np
|
15 |
-
import streamlit as st
|
16 |
-
import pandas as pd
|
17 |
-
from rank_bm25 import BM25Okapi
|
18 |
-
from sklearn.feature_extraction import _stop_words
|
19 |
-
import string
|
20 |
-
from tqdm.autonotebook import tqdm
|
21 |
-
import numpy as np
|
22 |
-
import docx
|
23 |
-
from docx.shared import Inches
|
24 |
-
from docx.shared import Pt
|
25 |
-
from docx.enum.style import WD_STYLE_TYPE
|
26 |
-
import logging
|
27 |
-
logger = logging.getLogger(__name__)
|
28 |
-
import tempfile
|
29 |
-
import sqlite3
|
30 |
-
import configparser
|
31 |
-
|
32 |
-
### These are lexical search related functions/methods#####
|
33 |
-
|
34 |
-
def bm25_tokenizer(text):
|
35 |
-
tokenized_doc = []
|
36 |
-
for token in text.lower().split():
|
37 |
-
token = token.strip(string.punctuation)
|
38 |
-
|
39 |
-
if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
|
40 |
-
tokenized_doc.append(token)
|
41 |
-
return tokenized_doc
|
42 |
-
|
43 |
-
def bm25TokenizeDoc(paraList):
|
44 |
-
tokenized_corpus = []
|
45 |
-
##########Commenting this for now########### will incorporate paragrpah splitting later.
|
46 |
-
# for passage in tqdm(paraList):
|
47 |
-
# if len(passage.split()) >256:
|
48 |
-
# # st.write("Splitting")
|
49 |
-
# temp = " ".join(passage.split()[:256])
|
50 |
-
# tokenized_corpus.append(bm25_tokenizer(temp))
|
51 |
-
# temp = " ".join(passage.split()[256:])
|
52 |
-
# tokenized_corpus.append(bm25_tokenizer(temp))
|
53 |
-
# else:
|
54 |
-
# tokenized_corpus.append(bm25_tokenizer(passage))
|
55 |
-
######################################################################################33333
|
56 |
-
for passage in tqdm(paraList):
|
57 |
-
tokenized_corpus.append(bm25_tokenizer(passage))
|
58 |
-
|
59 |
-
return tokenized_corpus
|
60 |
-
|
61 |
-
def lexical_search(keyword, document_bm25):
|
62 |
-
config = configparser.ConfigParser()
|
63 |
-
config.read_file(open('udfPreprocess/paramconfig.cfg'))
|
64 |
-
top_k = int(config.get('lexical_search','TOP_K'))
|
65 |
-
bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
|
66 |
-
top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
|
67 |
-
bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
|
68 |
-
bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
|
69 |
-
return bm25_hits
|
70 |
-
|
71 |
-
@st.cache(allow_output_mutation=True)
|
72 |
-
def load_sentenceTransformer(name):
|
73 |
-
return SentenceTransformer(name)
|
74 |
-
|
75 |
-
|
76 |
-
def semantic_search(keywordlist,paraList):
|
77 |
-
|
78 |
-
##### Sematic Search #####
|
79 |
-
#query = "Does document contain {} issues ?".format(keyword)
|
80 |
-
config = configparser.ConfigParser()
|
81 |
-
config.read_file(open('udfPreprocess/paramconfig.cfg'))
|
82 |
-
model_name = config.get('semantic_search','MODEL_NAME')
|
83 |
-
|
84 |
-
bi_encoder = load_sentenceTransformer(model_name)
|
85 |
-
bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH')) #Truncate long passages to 256 tokens
|
86 |
-
top_k = int(config.get('semantic_search','TOP_K'))
|
87 |
-
document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
|
88 |
-
question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)
|
89 |
-
|
90 |
-
hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
|
91 |
-
|
92 |
-
return hits
|
93 |
-
|
94 |
-
def show_results(keywordList):
|
95 |
-
document = docx.Document()
|
96 |
-
# document.add_heading('Document name:{}'.format(file_name), 2)
|
97 |
-
section = document.sections[0]
|
98 |
-
|
99 |
-
# Calling the footer
|
100 |
-
footer = section.footer
|
101 |
-
|
102 |
-
# Calling the paragraph already present in
|
103 |
-
# the footer section
|
104 |
-
footer_para = footer.paragraphs[0]
|
105 |
-
|
106 |
-
font_styles = document.styles
|
107 |
-
font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
|
108 |
-
font_object = font_charstyle.font
|
109 |
-
font_object.size = Pt(7)
|
110 |
-
# Adding the centered zoned footer
|
111 |
-
footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
|
112 |
-
document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)
|
113 |
-
for keyword in keywordList:
|
114 |
-
|
115 |
-
st.write("Results for Query: {}".format(keyword))
|
116 |
-
para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
|
117 |
-
para.font.size = Pt(12)
|
118 |
-
bm25_hits, hits = search(keyword)
|
119 |
-
|
120 |
-
st.markdown("""
|
121 |
-
We will provide with 2 kind of results. The 'lexical search' and the semantic search.
|
122 |
-
""")
|
123 |
-
# In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
|
124 |
-
st.markdown("Top few lexical search (BM25) hits")
|
125 |
-
document.add_paragraph("Top few lexical search (BM25) hits")
|
126 |
-
|
127 |
-
for hit in bm25_hits[0:5]:
|
128 |
-
if hit['score'] > 0.00:
|
129 |
-
st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
130 |
-
document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
# st.table(bm25_hits[0:3])
|
135 |
-
|
136 |
-
st.markdown("\n-------------------------\n")
|
137 |
-
st.markdown("Top few Bi-Encoder Retrieval hits")
|
138 |
-
document.add_paragraph("\n-------------------------\n")
|
139 |
-
document.add_paragraph("Top few Bi-Encoder Retrieval hits")
|
140 |
-
|
141 |
-
hits = sorted(hits, key=lambda x: x['score'], reverse=True)
|
142 |
-
for hit in hits[0:5]:
|
143 |
-
# if hit['score'] > 0.45:
|
144 |
-
st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
145 |
-
document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
udfPreprocess/uploadAndExample.py
DELETED
@@ -1,48 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import tempfile
|
3 |
-
import udfPreprocess.docPreprocessing as pre
|
4 |
-
import udfPreprocess.cleaning as clean
|
5 |
-
|
6 |
-
def add_upload(choice):
|
7 |
-
|
8 |
-
|
9 |
-
if choice == 'Upload Document':
|
10 |
-
uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
|
11 |
-
if uploaded_file is not None:
|
12 |
-
with tempfile.NamedTemporaryFile(mode="wb") as temp:
|
13 |
-
bytes_data = uploaded_file.getvalue()
|
14 |
-
temp.write(bytes_data)
|
15 |
-
st.session_state['filename'] = uploaded_file.name
|
16 |
-
# st.write("Uploaded Filename: ", uploaded_file.name)
|
17 |
-
file_name = uploaded_file.name
|
18 |
-
file_path = temp.name
|
19 |
-
docs = pre.load_document(file_path, file_name)
|
20 |
-
haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
|
21 |
-
st.session_state['docs'] = docs
|
22 |
-
st.session_state['paraList'] = paraList
|
23 |
-
|
24 |
-
|
25 |
-
else:
|
26 |
-
# listing the options
|
27 |
-
option = st.sidebar.selectbox('Select the example document',
|
28 |
-
('South Africa:Low Emission strategy',
|
29 |
-
'Ethiopia: 10 Year Development Plan'))
|
30 |
-
if option is 'South Africa:Low Emission strategy':
|
31 |
-
file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
|
32 |
-
st.session_state['filename'] = file_name
|
33 |
-
# st.write("Selected document:", file_name.split('/')[1])
|
34 |
-
# with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
|
35 |
-
# file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
|
36 |
-
else:
|
37 |
-
# with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
|
38 |
-
file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
|
39 |
-
st.session_state['filename'] = file_name
|
40 |
-
# st.write("Selected document:", file_name.split('/')[1])
|
41 |
-
|
42 |
-
if option is not None:
|
43 |
-
docs = pre.load_document(file_path,file_name)
|
44 |
-
haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
|
45 |
-
st.session_state['docs'] = docs
|
46 |
-
st.session_state['paraList'] = paraList
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|