import json
import re

import numpy as np
import hunspell
import nltk
import nltk.corpus
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import ne_chunk
import yake
import spacy

#dic = hunspell.Hunspell('/Users/miguel.r/Desktop/UNIR/PLenTaS/CORPUS/dict_es_ES/es_ES', '/Users/miguel.r/Desktop/es_ES/es_ES.dic')

nlp = spacy.load('es_core_news_sm')  # spaCy Spanish (es) model

nltk.download('punkt')
nltk.download('maxent_ne_chunker')  # required by ne_chunk in word_categorization
nltk.download('words')              # required by ne_chunk in word_categorization


# Class for counting the syllables of a word
# (source: https://github.com/amunozf/separasilabas/blob/master/separasilabas.py)
class char_line():
    def __init__(self, word):
        self.word = word
        self.char_line = [(char, self.char_type(char)) for char in word]
        self.type_line = ''.join(chartype for char, chartype in self.char_line)

    def char_type(self, char):
        if char in set(['a', 'á', 'e', 'é', 'o', 'ó', 'í', 'ú']):
            return 'V'  # strong vowel
        if char in set(['i', 'u', 'ü']):
            return 'v'  # weak vowel
        if char == 'x':
            return 'x'
        if char == 's':
            return 's'
        else:
            return 'c'

    def find(self, finder):
        return self.type_line.find(finder)

    def split(self, pos, where):
        return char_line(self.word[0:pos + where]), char_line(self.word[pos + where:])

    def split_by(self, finder, where):
        split_point = self.find(finder)
        if split_point != -1:
            chl1, chl2 = self.split(split_point, where)
            return chl1, chl2
        return self, False

    def __str__(self):
        return self.word

    def __repr__(self):
        return repr(self.word)


class silabizer():
    def __init__(self):
        self.grammar = []

    def split(self, chars):
        rules = [('VV', 1), ('cccc', 2), ('xcc', 1), ('ccx', 2), ('csc', 2),
                 ('xc', 1), ('cc', 1), ('vcc', 2), ('Vcc', 2), ('sc', 1),
                 ('cs', 1), ('Vc', 1), ('vc', 1), ('Vs', 1), ('vs', 1)]
        for split_rule, where in rules:
            first, second = chars.split_by(split_rule, where)
            if second:
                if first.type_line in set(['c', 's', 'x', 'cs']) or second.type_line in set(['c', 's', 'x', 'cs']):
                    #print('skip1', first.word, second.word, split_rule, chars.type_line)
                    continue
                if first.type_line[-1] == 'c' and second.word[0] in set(['l', 'r']):
                    continue
                if first.word[-1] == 'l' and second.word[-1] == 'l':
                    continue
                if first.word[-1] == 'r' and second.word[-1] == 'r':
                    continue
                if first.word[-1] == 'c' and second.word[-1] == 'h':
                    continue
                return self.split(first) + self.split(second)
        return [chars]

    def __call__(self, word):
        return self.split(char_line(word))
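
# Minimal usage sketch of the syllable splitter (illustrative): the returned
# list holds one char_line chunk per syllable, so its length is the syllable
# count used by check_senteces_words below.
#   >>> silabas = silabizer()
#   >>> silabas('terremoto')
#   ['te', 'rre', 'mo', 'to']
#   >>> len(silabas('terremoto'))
#   4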
# Counter for the number of sentences and words used in the answer
def check_senteces_words(student_answer):
    # Tokenizing into sentences
    sentences = []
    words = []
    letter_per_word = []
    syll = 0  # syllables counter
    TokenizeAnswer = sent_tokenize(student_answer)
    for token in TokenizeAnswer:
        token = re.sub('\\.', '', token)
        sentences.append(token)
    for i in range(len(sentences)):
        word = sentences[i].split(' ')
        for j in range(len(word)):
            words.append(word[j])
            syllables = silabizer()
            syll = syll + len(syllables(word[j]))
            letter_per_word.append(len(word[j]))
    sentencesLenght = len(sentences)
    wordsLenght = len(words)
    #print(f'Number of sentences used in the answer: {sentencesLenght}')
    #print(f'Number of words used in the answer: {wordsLenght}')
    return sentencesLenght, wordsLenght, syll, letter_per_word


# Spelling mistakes counter
def spelling_corrector(student_answer,
                       hunspell_aff='/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES',
                       hunspell_dic='/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES.dic'):
    dic = hunspell.Hunspell(hunspell_aff, hunspell_dic)
    errors = 0
    words = student_answer.split(' ')
    wrong_words = []
    for word in words:
        for element in clean_words(word):
            if not dic.spell(element):
                #print(f'Spelling mistake: {element}')
                wrong_words.append(element)
                errors += 1
    #print(f'Spelling mistakes: {errors}')
    return errors, wrong_words


# Readability of the answer according to the Fernández-Huerta index:
# FH = 206.84 - 0.60 * (syllables per 100 words) - 1.02 * (sentences per 100 words)
def FHuertas_index(sentencesLenght, wordsLenght, syll):
    FH = 206.84 - 0.60 * (syll * 100 / wordsLenght) - 1.02 * (sentencesLenght * 100 / wordsLenght)
    FH = round(FH, 3)
    legibilidad_fh = ""
    #print(f'\nFernández-Huerta Index: {FH}')
    if 0 < FH <= 30:
        legibilidad_fh = 'muy difícil'
    if 30 < FH <= 50:
        legibilidad_fh = 'difícil'
    if 50 < FH <= 60:
        legibilidad_fh = 'ligeramente difícil'
    if 60 < FH <= 70:
        legibilidad_fh = 'adecuado'
    if 70 < FH <= 80:
        legibilidad_fh = 'ligeramente fácil'
    if 80 < FH <= 90:
        legibilidad_fh = 'fácil'
    if 90 < FH <= 100:
        legibilidad_fh = 'muy fácil'
    return FH, legibilidad_fh


# Readability of the answer according to the mu index:
# mu = (n / (n - 1)) * (mean letters per word / variance of letters per word) * 100
def mu_index(sentencesLenght, wordsLenght, letter_per_word):
    med = np.mean(letter_per_word)
    var = np.var(letter_per_word)
    mu = (wordsLenght / (wordsLenght - 1)) * (med / var) * 100
    mu = round(mu, 3)
    legibilidad_mu = ""
    #print(f'\nMu index: {mu}')
    if 0 < mu <= 30:
        legibilidad_mu = 'muy difícil'
    if 30 < mu <= 50:
        legibilidad_mu = 'difícil'
    if 50 < mu <= 60:
        legibilidad_mu = 'ligeramente difícil'
    if 60 < mu <= 70:
        legibilidad_mu = 'adecuado'
    if 70 < mu <= 80:
        legibilidad_mu = 'ligeramente fácil'
    if 80 < mu <= 90:
        legibilidad_mu = 'fácil'
    if 90 < mu <= 100:
        legibilidad_mu = 'muy fácil'
    return mu, legibilidad_mu


# Keyword extraction from a text with the yake library
def keyword_extractor(text, numOfKeywords, language, max_ngram_size,
                      deduplication_threshold=0.9, features=None):
    test_keywords = []
    # Deleting line breaks and setting the text in lower case
    text = re.sub('\n', ' ', text)
    text = text.lower()
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size,
                                                dedupLim=deduplication_threshold,
                                                top=numOfKeywords, features=features)
    keywords = custom_kw_extractor.extract_keywords(text)
    for kw in keywords:
        test_keywords.append(kw[0])
    return test_keywords
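
# Minimal usage sketch of the metrics above (the answer string and the
# misspelled word are hypothetical; spelling_corrector additionally assumes
# the default Hunspell dictionary paths exist on this machine):
#   n_sent, n_words, n_syll, letters = check_senteces_words('Hay dos casas. Son rojas.')
#   FH, nivel_fh = FHuertas_index(n_sent, n_words, n_syll)
#   mu, nivel_mu = mu_index(n_sent, n_words, letters)
#   errores, mal_escritas = spelling_corrector('Esto es una prueva')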
# Word categorization
def word_categorization(student_answer):
    fileDocument = []
    TokenizeAnswer = sent_tokenize(student_answer)
    for token in TokenizeAnswer:
        fileDocument.append(token)
    sentencesLenght = len(fileDocument)
    sentence = 0
    while sentence < sentencesLenght:
        # Word-tokenize the sentence and tag each word with its grammatical
        # category (verb, noun, adj, etc.)
        word_tokens = word_tokenize(fileDocument[sentence])
        doc = nlp(fileDocument[sentence])
        pre_chunk = [(w.text, w.pos_) for w in doc]
        #print(pre_chunk)
        #pre_chunk = nltk.pos_tag(word_tokens)
        tree = ne_chunk(pre_chunk)  # same tagging as before

        # Chunking rules (spaCy universal POS tags, matching the tags in pre_chunk):
        grammer_np = r"""
            NP: {<DET>?<ADJ>*<NOUN>}   # chunk determiner/possessive, adjectives and nouns
                {<PROPN>+}             # chunk sequences of proper nouns
        """
        chunk_parser = nltk.RegexpParser(grammer_np)
        chunk_result = chunk_parser.parse(tree)
        #print(chunk_result)
        sentence += 1
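
# Illustrative sketch of the chunking rules above on a hypothetical tagged
# sentence (spaCy universal POS tags; grammer_np as defined in the function):
#   chunk_parser = nltk.RegexpParser(grammer_np)
#   chunk_parser.parse([('la', 'DET'), ('energía', 'NOUN'), ('luminosa', 'ADJ')])
#   -> Tree('S', [Tree('NP', [('la', 'DET'), ('energía', 'NOUN')]), ('luminosa', 'ADJ')])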
?*}") # Chunking rules to filter out: grammer_np = ("NP: {?**}") grammar = r""" NP: {?*} # chunk determiner/possessive, adjectives and nouns {+} # chunk sequences of proper nouns """ chunk_parser = nltk.RegexpParser(grammer_np) chunk_result = chunk_parser.parse(tree) #.................................................................................................. def char_split(word, character): palabra1="" palabra2="" found = 0 for w in word: if w == character and not found: found = 1 else: if not found: palabra1 = palabra1 + w else: palabra2 = palabra2 + w return [palabra1, palabra2] def clean_words(string): words_sentence = [] for w in string: if not w.isalnum(): if char_split(string, w)[0] != "": words_sentence.append(char_split(string, w)[0]) string = char_split(string, w)[len(char_split(string, w))-1] if string != "": words_sentence.append(string) return words_sentence def getNameFile(string): directories = string.split("/") return re.sub(".json","", directories[len(directories)-1]) def getIDrange(rango_ID, df): if rango_ID == "All": IDs = list(range(len(df['hashed_id']))) else: rango = [] r= rango_ID.split(",") for i in r: c_w= clean_words(i) if len(c_w) == 2: rango= rango + list(range(int(c_w[0]) -1 ,int(c_w[1]))) elif len(c_w) == 1: rango.append(int(c_w[0]) -1) IDs = rango return IDs def save_json(path, data, isIndent = True): if isIndent: json_object = json.dumps(data, indent = 11, ensure_ascii= False) else: json_object = json.dumps(data, ensure_ascii= False) # Writing output to a json file with open(path, "w") as outfile: outfile.write(json_object) def load_json(path): with open(path, "r", encoding="utf8") as f: data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]") return data def load_json_dtset(path): with open(path, "r", encoding="latin-1") as f: data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]") return data def splitResponse(respuesta_alumno_raw): #pre-processing the student's response regex = '\\\n' respuesta_alumno = re.sub(regex , ' ', respuesta_alumno_raw) respuesta_alumno = respuesta_alumno.lower() #stacking each sentence of the student's response sentences=[] TokenizeAnswer = sent_tokenize(respuesta_alumno) for token in TokenizeAnswer: regex = '\\.' token = re.sub(regex , '', token) sentences.append(token) return sentences def create_file_path(file, doctype): """ This function is to create relative paths to store data. Inputs: file: the file or subpath + file where the info is to be stored doctype: 1- Info from the api, 2- Output documents, 3- Images, 4- Bert models/documents Outputs: path: the generated path """ if doctype == 1: path = "api/" + file elif doctype == 2: path = "archivos/OutputFiles2/" + file elif doctype == 3: path = "archivos/Images/" + file else: path = "codeScripts/Dependencies/BERT-models/Prueba3/" + file return path