prashant committed on
Commit
f47e7d4
·
1 Parent(s): 8f1008c

reverting changes

Browse files
udfPreprocess/cleaning.py CHANGED
@@ -1,4 +1,3 @@
1
- import logging
2
  import pandas as pd
3
  import numpy as np
4
  import string
@@ -11,7 +10,7 @@ import streamlit as st
11
  from haystack.nodes import PreProcessor
12
 
13
  '''basic cleaning - suitable for transformer models'''
14
- def basic(s,SDG = False):
15
  """
16
  :param s: string to be processed
17
  :return: processed string: see comments in the source code for more info
@@ -24,15 +23,6 @@ def basic(s,SDG = False):
24
  # Remove URLs
25
  s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
26
  s = re.sub(r"http\S+", " ", s)
27
- if SDG == True:
28
- s = s.lower()
29
- translator = str.maketrans(' ', ' ', string.punctuation)
30
- s = s.translate(translator)
31
- s = re.sub('\n', ' ', s)
32
- s = re.sub("\'", " ", s)
33
- s = re.sub(r'\d+', ' ', s)
34
- s = re.sub(r'\W+', ' ', s)
35
-
36
  # Remove new line characters
37
  #s = re.sub('\n', ' ', s)
38
 
@@ -69,10 +59,9 @@ def preprocessingForSDG(document):
69
  for i in document:
70
  docs_processed = preprocessor.process([i])
71
  for item in docs_processed:
72
- item.content = basic(item.content, SDG = True)
73
 
74
- with st.spinner("👑 document being splitted into paragraphs"):
75
- logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
76
 
77
  # create dataframe of text and list of all text
78
  df = pd.DataFrame(docs_processed)
@@ -104,8 +93,7 @@ def preprocessing(document):
104
  for item in docs_processed:
105
  item.content = basic(item.content)
106
 
107
- with st.spinner("👑 document being splitted into paragraphs"):
108
- logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
109
 
110
  # create dataframe of text and list of all text
111
  df = pd.DataFrame(docs_processed)
 
 
1
  import pandas as pd
2
  import numpy as np
3
  import string
 
10
  from haystack.nodes import PreProcessor
11
 
12
  '''basic cleaning - suitable for transformer models'''
13
+ def basic(s):
14
  """
15
  :param s: string to be processed
16
  :return: processed string: see comments in the source code for more info
 
23
  # Remove URLs
24
  s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
25
  s = re.sub(r"http\S+", " ", s)
 
 
 
 
 
 
 
 
 
26
  # Remove new line characters
27
  #s = re.sub('\n', ' ', s)
28
 
 
59
  for i in document:
60
  docs_processed = preprocessor.process([i])
61
  for item in docs_processed:
62
+ item.content = basic(item.content)
63
 
64
+ st.write("your document has been splitted to", len(docs_processed), "paragraphs")
 
65
 
66
  # create dataframe of text and list of all text
67
  df = pd.DataFrame(docs_processed)
 
93
  for item in docs_processed:
94
  item.content = basic(item.content)
95
 
96
+ st.write("your document has been splitted to", len(docs_processed), "paragraphs")
 
97
 
98
  # create dataframe of text and list of all text
99
  df = pd.DataFrame(docs_processed)
udfPreprocess/docPreprocessing.py CHANGED
@@ -65,11 +65,11 @@ def load_document(
65
  This can happen whith certain pdf types.'''
66
  for i in documents:
67
  if i.content == "":
68
- with st.spinner("using pdfplumber"):
69
- text = []
70
- with pdfplumber.open(file_path) as pdf:
71
- for page in pdf.pages:
72
- text.append(page.extract_text())
73
- i.content = ' '.join([page for page in text])
74
 
75
  return documents
 
65
  This can happen whith certain pdf types.'''
66
  for i in documents:
67
  if i.content == "":
68
+ st.write("using pdfplumber")
69
+ text = []
70
+ with pdfplumber.open(file_path) as pdf:
71
+ for page in pdf.pages:
72
+ text.append(page.extract_text())
73
+ i.content = ' '.join([page for page in text])
74
 
75
  return documents
udfPreprocess/paramconfig.cfg DELETED
@@ -1,12 +0,0 @@
1
- [lexical_search]
2
- TOP_K = 10
3
- THRESHOLD = 0.1
4
-
5
- [semantic_search]
6
- TOP_K = 10
7
- MAX_SEQ_LENGTH = 64
8
- MODEL_NAME = msmarco-distilbert-cos-v5
9
- THRESHOLD = 0.1
10
-
11
- [sdg]
12
- THRESHOLD = 0.85
 
 
 
 
 
 
 
 
 
 
 
 
 
udfPreprocess/sdg.py DELETED
@@ -1,57 +0,0 @@
1
- import glob, os, sys;
2
- sys.path.append('../udfPreprocess')
3
-
4
- #import helper
5
- import udfPreprocess.docPreprocessing as pre
6
- import udfPreprocess.cleaning as clean
7
-
8
- #import needed libraries
9
- import seaborn as sns
10
- from pandas import DataFrame
11
- from keybert import KeyBERT
12
- from transformers import pipeline
13
- import matplotlib.pyplot as plt
14
- import numpy as np
15
- import streamlit as st
16
- import pandas as pd
17
- import docx
18
- from docx.shared import Inches
19
- from docx.shared import Pt
20
- from docx.enum.style import WD_STYLE_TYPE
21
-
22
- import tempfile
23
- import sqlite3
24
- import logging
25
- logger = logging.getLogger(__name__)
26
- import configparser
27
-
28
- @st.cache(allow_output_mutation=True)
29
- def load_sdgClassifier():
30
- classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
31
- logging.info("Loading classifier")
32
- return classifier
33
-
34
- def sdg_classification(par_list):
35
- logging.info("running SDG classifiication")
36
- config = configparser.ConfigParser()
37
- config.read_file(open('udfPreprocess/paramconfig.cfg'))
38
- threshold = float(config.get('sdg','THRESHOLD'))
39
-
40
-
41
- classifier = load_sdgClassifier()
42
- labels = classifier(par_list)
43
-
44
- labels_= [(l['label'],l['score']) for l in labels]
45
- # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
46
- df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
47
-
48
- df2['text'] = par_list
49
- df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
50
- df2.index += 1
51
- df2 =df2[df2['Relevancy']>threshold]
52
- x = df2['SDG'].value_counts()
53
- df3 = df2.copy()
54
- df3= df3.drop(['Relevancy'], axis = 1)
55
-
56
-
57
- return df3, x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
udfPreprocess/search.py DELETED
@@ -1,145 +0,0 @@
1
- import glob, os, sys; sys.path.append('../udfPreprocess')
2
-
3
- #import helper
4
- import udfPreprocess.docPreprocessing as pre
5
- import udfPreprocess.cleaning as clean
6
-
7
- #import needed libraries
8
- import seaborn as sns
9
- from pandas import DataFrame
10
- from sentence_transformers import SentenceTransformer, CrossEncoder, util
11
- # from keybert import KeyBERT
12
- from transformers import pipeline
13
- import matplotlib.pyplot as plt
14
- import numpy as np
15
- import streamlit as st
16
- import pandas as pd
17
- from rank_bm25 import BM25Okapi
18
- from sklearn.feature_extraction import _stop_words
19
- import string
20
- from tqdm.autonotebook import tqdm
21
- import numpy as np
22
- import docx
23
- from docx.shared import Inches
24
- from docx.shared import Pt
25
- from docx.enum.style import WD_STYLE_TYPE
26
- import logging
27
- logger = logging.getLogger(__name__)
28
- import tempfile
29
- import sqlite3
30
- import configparser
31
-
32
- ### These are lexical search related functions/methods#####
33
-
34
- def bm25_tokenizer(text):
35
- tokenized_doc = []
36
- for token in text.lower().split():
37
- token = token.strip(string.punctuation)
38
-
39
- if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
40
- tokenized_doc.append(token)
41
- return tokenized_doc
42
-
43
- def bm25TokenizeDoc(paraList):
44
- tokenized_corpus = []
45
- ##########Commenting this for now########### will incorporate paragrpah splitting later.
46
- # for passage in tqdm(paraList):
47
- # if len(passage.split()) >256:
48
- # # st.write("Splitting")
49
- # temp = " ".join(passage.split()[:256])
50
- # tokenized_corpus.append(bm25_tokenizer(temp))
51
- # temp = " ".join(passage.split()[256:])
52
- # tokenized_corpus.append(bm25_tokenizer(temp))
53
- # else:
54
- # tokenized_corpus.append(bm25_tokenizer(passage))
55
- ######################################################################################33333
56
- for passage in tqdm(paraList):
57
- tokenized_corpus.append(bm25_tokenizer(passage))
58
-
59
- return tokenized_corpus
60
-
61
- def lexical_search(keyword, document_bm25):
62
- config = configparser.ConfigParser()
63
- config.read_file(open('udfPreprocess/paramconfig.cfg'))
64
- top_k = int(config.get('lexical_search','TOP_K'))
65
- bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
66
- top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
67
- bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
68
- bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
69
- return bm25_hits
70
-
71
- @st.cache(allow_output_mutation=True)
72
- def load_sentenceTransformer(name):
73
- return SentenceTransformer(name)
74
-
75
-
76
- def semantic_search(keywordlist,paraList):
77
-
78
- ##### Sematic Search #####
79
- #query = "Does document contain {} issues ?".format(keyword)
80
- config = configparser.ConfigParser()
81
- config.read_file(open('udfPreprocess/paramconfig.cfg'))
82
- model_name = config.get('semantic_search','MODEL_NAME')
83
-
84
- bi_encoder = load_sentenceTransformer(model_name)
85
- bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH')) #Truncate long passages to 256 tokens
86
- top_k = int(config.get('semantic_search','TOP_K'))
87
- document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
88
- question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)
89
-
90
- hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
91
-
92
- return hits
93
-
94
- def show_results(keywordList):
95
- document = docx.Document()
96
- # document.add_heading('Document name:{}'.format(file_name), 2)
97
- section = document.sections[0]
98
-
99
- # Calling the footer
100
- footer = section.footer
101
-
102
- # Calling the paragraph already present in
103
- # the footer section
104
- footer_para = footer.paragraphs[0]
105
-
106
- font_styles = document.styles
107
- font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
108
- font_object = font_charstyle.font
109
- font_object.size = Pt(7)
110
- # Adding the centered zoned footer
111
- footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
112
- document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)
113
- for keyword in keywordList:
114
-
115
- st.write("Results for Query: {}".format(keyword))
116
- para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
117
- para.font.size = Pt(12)
118
- bm25_hits, hits = search(keyword)
119
-
120
- st.markdown("""
121
- We will provide with 2 kind of results. The 'lexical search' and the semantic search.
122
- """)
123
- # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
124
- st.markdown("Top few lexical search (BM25) hits")
125
- document.add_paragraph("Top few lexical search (BM25) hits")
126
-
127
- for hit in bm25_hits[0:5]:
128
- if hit['score'] > 0.00:
129
- st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
130
- document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
131
-
132
-
133
-
134
- # st.table(bm25_hits[0:3])
135
-
136
- st.markdown("\n-------------------------\n")
137
- st.markdown("Top few Bi-Encoder Retrieval hits")
138
- document.add_paragraph("\n-------------------------\n")
139
- document.add_paragraph("Top few Bi-Encoder Retrieval hits")
140
-
141
- hits = sorted(hits, key=lambda x: x['score'], reverse=True)
142
- for hit in hits[0:5]:
143
- # if hit['score'] > 0.45:
144
- st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
145
- document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
udfPreprocess/uploadAndExample.py DELETED
@@ -1,48 +0,0 @@
1
- import streamlit as st
2
- import tempfile
3
- import udfPreprocess.docPreprocessing as pre
4
- import udfPreprocess.cleaning as clean
5
-
6
- def add_upload(choice):
7
-
8
-
9
- if choice == 'Upload Document':
10
- uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
11
- if uploaded_file is not None:
12
- with tempfile.NamedTemporaryFile(mode="wb") as temp:
13
- bytes_data = uploaded_file.getvalue()
14
- temp.write(bytes_data)
15
- st.session_state['filename'] = uploaded_file.name
16
- # st.write("Uploaded Filename: ", uploaded_file.name)
17
- file_name = uploaded_file.name
18
- file_path = temp.name
19
- docs = pre.load_document(file_path, file_name)
20
- haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
21
- st.session_state['docs'] = docs
22
- st.session_state['paraList'] = paraList
23
-
24
-
25
- else:
26
- # listing the options
27
- option = st.sidebar.selectbox('Select the example document',
28
- ('South Africa:Low Emission strategy',
29
- 'Ethiopia: 10 Year Development Plan'))
30
- if option is 'South Africa:Low Emission strategy':
31
- file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
32
- st.session_state['filename'] = file_name
33
- # st.write("Selected document:", file_name.split('/')[1])
34
- # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
35
- # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
36
- else:
37
- # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
38
- file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
39
- st.session_state['filename'] = file_name
40
- # st.write("Selected document:", file_name.split('/')[1])
41
-
42
- if option is not None:
43
- docs = pre.load_document(file_path,file_name)
44
- haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
45
- st.session_state['docs'] = docs
46
- st.session_state['paraList'] = paraList
47
-
48
-