Spaces:
Runtime error
Runtime error
File size: 4,490 Bytes
a87bc00 2e83a41 a87bc00 2e83a41 a87bc00 8b674fc a87bc00 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
# -*- coding: utf-8 -*-
"""
# MANIFESTO ANALYSIS
## IMPORTING LIBRARIES
"""
# Commented out IPython magic to ensure Python compatibility.
# %%capture
# !pip install tika
# !pip install clean-text
# !pip install gradio
# Commented out IPython magic to ensure Python compatibility.
import io
import random
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import tika
from tika import parser
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from cleantext import clean
import nltk.corpus
from nltk.text import Text
from io import StringIO
import sys
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from textblob import TextBlob
from PIL import Image
import gradio as gr
from zipfile import ZipFile
# Fetch the NLTK data packages this script requests at start-up.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)
"""## PARSING FILES"""
def Parsing(parsed_text):
    '''
    Extract the text content of an uploaded document and normalise it.

    parsed_text must expose a ``name`` attribute holding the path of the
    file on disk (presumably the Gradio upload wrapper - confirm against
    the caller). The file is handed to Tika and the 'content' entry of the
    parse result is passed through cleantext's ``clean``.
    '''
    document_path = parsed_text.name
    tika_result = parser.from_file(document_path)
    return clean(tika_result['content'])
# Extra stop words layered on top of NLTK's English list to keep irrelevant
# terms out of the analysis.
# NOTE: set.update() treats every positional argument as an iterable, so the
# words are passed as ONE list; passing bare strings would add their
# individual characters ('a', 's', 'k', ...) instead of the words themselves.
stop_words = set(stopwords.words('english'))
stop_words.update([
    'ask', 'much', 'thank', 'etc.', 'e', 'We', 'In', 'ed', 'pa', 'This',
    'also', 'A', 'fu', 'To', '5', 'ing', 'er', '2',
])
"""## PREPROCESSING"""
def clean_text(text):
    '''
    Return text reduced to ASCII, whitespace-normalised and without the
    words listed in wordcloud's STOPWORDS.

    The result is the surviving words joined by single spaces.
    '''
    # Characters that cannot be encoded as ASCII are dropped outright.
    ascii_only = text.encode("ascii", errors="ignore").decode("ascii")
    # Newlines and tabs become plain spaces first, so a slash followed by
    # either of them is also matched by the "/ " rule below.
    flattened = re.sub(r"[\n\t]", " ", ascii_only)
    flattened = re.sub(r"/ ", " ", flattened)
    # split() without arguments ignores leading/trailing whitespace and
    # treats any run of whitespace as one separator.
    kept_words = (token for token in flattened.split() if token not in STOPWORDS)
    return ' '.join(kept_words)
# text_Party=clean_text(raw_party)
def Preprocess(textParty):
    '''
    Strip special characters and English stop words from textParty.

    Every run of characters other than ASCII letters and digits is replaced
    by a single space; afterwards each whole-word match of an NLTK English
    stop word is deleted together with the whitespace that follows it.
    '''
    alphanumeric_only = re.sub('[^A-Za-z0-9]+', ' ', textParty)
    # One alternation of all stop words, anchored on word boundaries.
    stopword_alternatives = r'|'.join(stopwords.words('english'))
    stopword_regex = re.compile(r'\b(' + stopword_alternatives + r')\b\s*')
    return stopword_regex.sub('', alphanumeric_only)
# Using concordance, you can see each occurrence of a word along with its
# immediate context. It gives you a peek into how a word is being used
# at the sentence level and which words are used with it.
def concordance(text_Party,strng):
    '''
    Return the concordance listing of strng within text_Party as a string.

    text_Party is tokenized and wrapped in an nltk ``Text``. Its
    ``concordance`` method prints its listing (requested with lines=10 and
    width=82) to standard output, which is captured here and returned
    instead of being shown on the console.
    '''
    from contextlib import redirect_stdout

    tokens = word_tokenize(text_Party)
    captured = StringIO()
    # redirect_stdout restores sys.stdout on exit even when concordance()
    # raises, so a failure cannot leave the process with a hijacked stdout.
    with redirect_stdout(captured):
        Text(tokens).concordance(strng, lines=10, width=82)
    return captured.getvalue()
def normalize(d, target=1.0):
    '''
    Scale the values of d so that they sum to target.

    d      : mapping of key -> number
    target : the sum the returned values should add up to (default 1.0)

    Returns a new dict with the same keys; d itself is not modified.
    When the values sum to zero (including the empty mapping) there is
    nothing to scale, so every key is mapped to 0.0 instead of raising
    ZeroDivisionError.
    '''
    raw = sum(d.values())
    if not raw:
        return {key: 0.0 for key in d}
    factor = target/raw
    return {key: value*factor for key, value in d.items()}
def fDistance(text2Party):
    '''
    Most frequent words search.

    Tokenizes text2Party, takes the 10 most common tokens with their counts
    and returns them as a token -> weight dict after passing the counts
    through normalize().
    '''
    tokens = word_tokenize(text2Party)
    top_ten = FreqDist(tokens).most_common(10)
    # most_common() yields (token, count) pairs, which dict() consumes directly.
    counts = dict(top_ten)
    return normalize(counts)
def fDistancePlot(text2Party,plotN=20):
    '''
    Most frequent words visualisation.

    text2Party : text to tokenize and count
    plotN      : number of most frequent tokens to plot (default 20)

    Returns whatever FreqDist.plot() returns.
    '''
    tokens = word_tokenize(text2Party)
    # Honour the caller's plotN rather than a fixed 20.
    return FreqDist(tokens).plot(plotN)
## UI INTERFACE
def analysis(Manifesto,Search):
    '''
    Run the full pipeline for one uploaded manifesto.

    Manifesto : uploaded file object, passed straight to Parsing()
    Search    : term to look up in the processed text

    Returns a tuple (fdist_Party, searChRes):
      fdist_Party - the dict produced by fDistance() for the processed text
      searChRes   - the concordance listing for Search, with every exact
                    occurrence of Search underlined
    '''
    raw_party = Parsing(Manifesto)
    text_Party = Preprocess(clean_text(raw_party))
    fdist_Party = fDistance(text_Party)
    searchRes = concordance(text_Party, Search)
    # U+0332 (COMBINING LOW LINE) attaches to the character BEFORE it, so it
    # is appended after every character; merely joining the characters with
    # it would leave the last character of the term without an underline.
    underlined = "".join(ch + "\u0332" for ch in Search)
    searChRes = searchRes.replace(Search, underlined)
    return fdist_Party,searChRes
# Input widgets: free-text search term and the uploaded manifesto file.
Search_txt = gr.inputs.Textbox()
filePdf = gr.inputs.File()

# Output widgets: concordance text and the most-frequent-words label.
text = gr.outputs.Textbox(label='SEARCHED OUTPUT')
mfw = gr.outputs.Label(label="Most Relevant topics in manifesto")

# Wire the widgets to analysis() and launch the app immediately.
gr.Interface(
    fn=analysis,
    inputs=[filePdf, Search_txt],
    outputs=[mfw, text],
    title='Manifesto Analysis',
).launch(debug=False, share=True)
|