Spaces:

xiomarablanco
/

plentas

Runtime error

File size: 12,658 Bytes

import json
import numpy as np
import hunspell
import nltk
import nltk.corpus 
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import ne_chunk
import re
import yake
import spacy
import os
import shutil 
#dic = hunspell.Hunspell('/Users/miguel.r/Desktop/UNIR/PLenTaS/CORPUS/dict_es_ES/es_ES', '/Users/miguel.r/Desktop/es_ES/es_ES.dic')

# Module-level setup: both statements below run once, when this module is imported.
# `nlp` is the spaCy pipeline used by word_categorization() further down in this file.
nlp = spacy.load('es_core_news_sm') # spaCy package for Spanish (es)
# Request NLTK's 'punkt' resource; this file calls sent_tokenize / word_tokenize.
nltk.download('punkt')

# Clase creada para contar sílabas de una palabra (Source: https://github.com/amunozf/separasilabas/blob/master/separasilabas.py)

#class char():
    #def __init__(self):
       # pass
    
class char_line():
    """A word together with a parallel string of per-character classes.

    Attributes:
        word: the original text.
        char_line: list of ``(character, class)`` pairs, one per character.
        type_line: the classes joined into one string, so a rule such as
            ``'Vcc'`` can be located with a plain substring search.

    Character classes:
        'V' strong vowel (a, e, o and every accented vowel listed below),
        'v' weak vowel (i, u, ü), 'x' and 's' for those two letters,
        'c' for anything else.
    """

    # Built once instead of on every char_type() call.
    _STRONG_VOWELS = frozenset(['a', 'á', 'e', 'é', 'o', 'ó', 'í', 'ú'])
    _WEAK_VOWELS = frozenset(['i', 'u', 'ü'])

    def __init__(self, word):
        self.word = word
        self.char_line = [(char, self.char_type(char)) for char in word]
        self.type_line = ''.join(chartype for char, chartype in self.char_line)

    def char_type(self, char):
        """Return the one-letter class of *char* (see the class docstring).

        The comparison is case-insensitive: previously an upper-case vowel
        (e.g. the 'A' of "Ana") fell through to 'c' and was treated as a
        consonant.
        """
        lowered = char.lower()
        if lowered in self._STRONG_VOWELS:
            return 'V'  # strong vowel
        if lowered in self._WEAK_VOWELS:
            return 'v'  # weak vowel
        if lowered == 'x':
            return 'x'
        if lowered == 's':
            return 's'
        return 'c'

    def find(self, finder):
        """Return the index of the first match of *finder* in type_line, or -1."""
        return self.type_line.find(finder)

    def split(self, pos, where):
        """Cut the word at index ``pos + where`` and return both halves as char_line objects."""
        cut = pos + where
        return char_line(self.word[0:cut]), char_line(self.word[cut:])

    def split_by(self, finder, where):
        """Split at the first occurrence of the class pattern *finder*.

        Args:
            finder: pattern to look for in type_line (e.g. ``'Vc'``).
            where: offset, counted from the start of the match, of the cut.

        Returns:
            ``(left, right)`` as char_line objects when the pattern occurs,
            otherwise ``(self, False)``.
        """
        split_point = self.find(finder)
        if split_point == -1:
            return self, False
        return self.split(split_point, where)

    def __str__(self):
        return self.word

    def __repr__(self):
        return repr(self.word)

class silabizer():
    """Rule-based syllable splitter working on character-class patterns.

    Calling an instance with a word returns the list of pieces (objects of
    the same kind that ``char_line(word)`` produces) the word was cut into;
    ``len()`` of that list is the syllable count used elsewhere in this file.
    """

    # (pattern searched in type_line, offset of the cut from the match start).
    # Order matters: the first rule that yields an acceptable cut wins.
    _RULES = (
        ('VV', 1), ('cccc', 2), ('xcc', 1), ('ccx', 2), ('csc', 2),
        ('xc', 1), ('cc', 1), ('vcc', 2), ('Vcc', 2), ('sc', 1),
        ('cs', 1), ('Vc', 1), ('vc', 1), ('Vs', 1), ('vs', 1),
    )
    # A piece made only of these classes has no vowel and cannot stand alone.
    _VOWELLESS = frozenset(['c', 's', 'x', 'cs'])
    # Two-letter units that must never be separated by a cut.
    _DIGRAPHS = frozenset(['ll', 'rr', 'ch'])

    def __init__(self):
        self.grammar = []

    def split(self, chars):
        """Recursively cut *chars* and return the resulting pieces as a list.

        Args:
            chars: object exposing ``split_by(pattern, offset)``, ``word`` and
                ``type_line`` (a char_line).

        Returns:
            ``[chars]`` when no rule gives an acceptable cut, otherwise the
            concatenation of the splits of both halves.
        """
        for split_rule, where in self._RULES:
            first, second = chars.split_by(split_rule, where)
            if not second:
                continue  # pattern not present in this word
            if first.type_line in self._VOWELLESS or second.type_line in self._VOWELLESS:
                continue  # one side would be left without a vowel
            if first.type_line[-1] == 'c' and second.word[0] in ('l', 'r'):
                continue  # keep consonant + l/r clusters together
            # The digraph is formed by the LAST letter of the left half and
            # the FIRST letter of the right half. The previous code looked at
            # the last letter of the right half, so "ch" was never protected
            # and unrelated words ending in l/r had valid cuts rejected.
            if (first.word[-1] + second.word[0]).lower() in self._DIGRAPHS:
                continue
            return self.split(first) + self.split(second)
        return [chars]

    def __call__(self, word):
        """Syllabify the plain string *word*."""
        return self.split(char_line(word))

# Counter for the number of sentences and words used in the answer
def check_senteces_words(student_answer):
    """Collect basic size statistics of *student_answer*.

    The text is cut into sentences with ``sent_tokenize``, every period is
    removed from each sentence, and each sentence is then split on single
    spaces to obtain its words.

    Returns:
        A 4-tuple ``(sentence_count, word_count, syllable_count,
        letter_per_word)`` where ``letter_per_word`` lists the length of
        every word in order of appearance and the syllable count is the sum
        of ``len(silabizer()(word))`` over all words.
    """
    # NOTE(review): split(' ') keeps empty strings for consecutive spaces or
    # an empty sentence; those are still counted as words (with length 0).
    sentences = [re.sub('\\.', '', piece) for piece in sent_tokenize(student_answer)]

    syllabify = silabizer()  # one instance serves every word
    words = []
    letter_per_word = []
    syll = 0  # running syllable total
    for sentence in sentences:
        for current in sentence.split(' '):
            words.append(current)
            syll += len(syllabify(current))
            letter_per_word.append(len(current))

    return len(sentences), len(words), syll, letter_per_word

# Counter for spelling mistakes
def spelling_corrector(student_answer, hunspell_aff = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES' , hunspell_dic = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES.dic' ):
    """Count the pieces of *student_answer* rejected by the spell checker.

    The answer is split on single spaces; each chunk is broken further with
    ``clean_words`` and every resulting piece is passed to the checker's
    ``spell`` method.

    Args:
        student_answer: text to check.
        hunspell_aff, hunspell_dic: forwarded, in this order, to
            ``hunspell.Hunspell(...)``. The defaults are absolute paths on a
            developer machine, so callers presumably need to override them.

    Returns:
        ``(errors, wrong_words)``: the number of rejected pieces and the
        pieces themselves, in order of appearance (repeats included).
    """
    checker = hunspell.Hunspell(hunspell_aff, hunspell_dic)
    wrong_words = [
        piece
        for chunk in student_answer.split(' ')
        for piece in clean_words(chunk)
        if not checker.spell(piece)
    ]
    return len(wrong_words), wrong_words
        
# Readability of the answer according to the Fernández-Huerta index
def FHuertas_index(sentencesLenght, wordsLenght, syll):
    """Compute the Fernández-Huerta readability score and its label.

    ``FH = 206.84 - 0.60 * P - 1.02 * F`` where ``P`` is syllables per 100
    words and ``F`` is sentences per 100 words; the score is rounded to three
    decimals.

    Args:
        sentencesLenght: number of sentences.
        wordsLenght: number of words.
        syll: number of syllables.

    Returns:
        ``(FH, label)``. The label is the empty string when the score falls
        outside ``(0, 100]``. When ``wordsLenght`` is 0 the index is
        undefined; instead of raising ZeroDivisionError the function reports
        it and returns ``(0, "")``, the same fallback mu_index uses.
    """
    if wordsLenght == 0:
        print("Error on FHuertas_index: division by zero")
        return 0, ""

    FH = 206.84 - 0.60*(syll*100/wordsLenght) - 1.02*(sentencesLenght*100/wordsLenght)
    FH = round(FH, 3)

    # (exclusive lower bound, inclusive upper bound, label).
    # NOTE(review): 'díficil' is misspelled (mu_index uses 'difícil'); the
    # strings are kept as they were because callers may compare against them.
    bands = (
        (0, 30, 'muy díficil'),
        (30, 50, 'díficil'),
        (50, 60, 'ligeramente díficil'),
        (60, 70, 'adecuado'),
        (70, 80, 'ligeramente fácil'),
        (80, 90, 'fácil'),
        (90, 100, 'muy fácil'),
    )
    legibilidad_fh = ""
    for lower, upper, label in bands:
        if lower < FH <= upper:
            legibilidad_fh = label
            break

    return FH, legibilidad_fh
    
# Readability of the answer according to the mu index
def mu_index(sentencesLenght, wordsLenght, letter_per_word):
    """Compute the mu readability score and its label.

    ``mu = (n / (n - 1)) * (mean / variance) * 100`` where ``n`` is
    ``wordsLenght`` and mean/variance are taken over ``letter_per_word``;
    the score is rounded to three decimals.

    Args:
        sentencesLenght: accepted for signature compatibility; not used.
        wordsLenght: number of words.
        letter_per_word: sequence with the length of every word.

    Returns:
        ``(mu, label)``. The label is the empty string when the score falls
        outside ``(0, 100]``. Whenever the score cannot be computed (no
        words, a single word, or all words of equal length so the variance is
        zero) the error is printed and ``(0, "")`` is returned.
    """
    try:
        if len(letter_per_word) == 0:
            raise ValueError("letter_per_word is empty")
        # Plain floats make a zero variance raise ZeroDivisionError; with
        # NumPy scalars the division silently produced inf/nan instead.
        med = float(np.mean(letter_per_word))
        var = float(np.var(letter_per_word))
        mu = (wordsLenght/(wordsLenght-1))*(med/var)*100
        if not np.isfinite(mu):
            raise ValueError("non-finite result")
    except Exception as ex:
        # Deliberately broad, as in the original: this is a best-effort
        # metric and must not abort the caller.
        print("Error on mu_index: " + str(ex))
        mu = 0

    mu = round(mu, 3)

    # (exclusive lower bound, inclusive upper bound, label)
    bands = (
        (0, 30, 'muy difícil'),
        (30, 50, 'difícil'),
        (50, 60, 'ligeramente difícil'),
        (60, 70, 'adecuado'),
        (70, 80, 'ligeramente fácil'),
        (80, 90, 'fácil'),
        (90, 100, 'muy fácil'),
    )
    legibilidad_mu = ""
    for lower, upper, label in bands:
        if lower < mu <= upper:
            legibilidad_mu = label
            break

    return mu, legibilidad_mu

# Extracts the keywords of a text with the yake library
def keyword_extractor(text, numOfKeywords, language, max_ngram_size,deduplication_threshold = 0.9, features=None):
    """Return the keywords yake extracts from *text*.

    Before extraction the text is passed through ``re.sub`` with a pattern
    made of a backslash followed by a newline character (an escaped, i.e.
    literal, newline), replacing each match with a space, and is then
    lower-cased.

    Args:
        text: input text.
        numOfKeywords: forwarded to yake as ``top``.
        language: forwarded to yake as ``lan``.
        max_ngram_size: forwarded to yake as ``n``.
        deduplication_threshold: forwarded to yake as ``dedupLim``.
        features: forwarded to yake as ``features``.

    Returns:
        A list with the first element of every item returned by
        ``extract_keywords``, in the order yake returned them.
    """
    flattened = re.sub('\\\n', ' ', text)
    lowered = flattened.lower()
    extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=features,
    )
    return [entry[0] for entry in extractor.extract_keywords(lowered)]

# Word categorization
def word_categorization(student_answer):    
    """Tag and chunk every sentence of *student_answer*.

    For each sentence produced by ``sent_tokenize`` the function tags the
    words with the module-level spaCy pipeline ``nlp``, passes the
    ``(text, pos)`` pairs to ``ne_chunk`` and parses the result with an
    ``nltk.RegexpParser`` built from ``grammer_np``.

    NOTE(review): the function has no ``return`` statement, so it yields
    ``None``; ``chunk_result`` is overwritten on every iteration and never
    leaves the function. ``word_tokens`` and ``grammar`` are assigned but not
    used afterwards in this function.
    """
    # Copy the sentence tokens into a list so they can be indexed below.
    fileDocument=[]
    TokenizeAnswer = sent_tokenize(student_answer)
    for token in TokenizeAnswer:
        fileDocument.append(token)
    sentencesLenght = len(fileDocument)
    sentence=0
    while sentence < sentencesLenght:      
        # Word-tokenize the sentence and attach a grammatical tag to each word (verb, noun, adj, etc.)
        word_tokens = word_tokenize(fileDocument[sentence])
        doc = nlp(fileDocument[sentence])
        # (token text, spaCy part-of-speech tag) pairs for the current sentence
        pre_chunk = [(w.text, w.pos_) for w in doc]
        #print(pre_chunk)
        # Advance the loop index here; the rest of the body uses pre_chunk, not the index.
        sentence += 1
        #pre_chunk = nltk.pos_tag(word_tokens)
        tree = ne_chunk(pre_chunk) # fed with the spaCy tags built above
        #grammer_np = ("NP: {<DT>?<JJ>*<NN>}")
        
        # Chunking rule actually given to the parser below:
        grammer_np = ("NP: {<DET>?<ADJ>*<NOUN>*<VERB>}")
        # Alternative grammar; defined here but not passed to the parser.
        grammar = r"""

          NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and nouns

              {<NNP>+}                # chunk sequences of proper nouns

        """
        chunk_parser = nltk.RegexpParser(grammer_np)
        chunk_result = chunk_parser.parse(tree)

#..................................................................................................
def char_split(word, character):
    """Split *word* at the first element equal to *character*.

    Returns:
        ``[before, after]``: the text preceding the first match and the text
        following it (the matching element itself is dropped; later
        occurrences stay in ``after``). When nothing matches, ``before`` is
        the whole word and ``after`` is the empty string.
    """
    remaining = iter(word)
    before = ""
    for symbol in remaining:
        if symbol == character:
            break  # separator consumed; everything left belongs to the tail
        before += symbol
    after = ""
    for symbol in remaining:
        after += symbol
    return [before, after]

def clean_words(string):
    """Split *string* into its maximal runs of alphanumeric characters.

    Every character for which ``str.isalnum()`` is false acts as a
    separator; empty pieces are discarded.

    Returns:
        The list of non-empty alphanumeric pieces, in order of appearance
        (``[]`` for an empty or separator-only string).

    The result is the same as before, but it is built in a single pass: the
    earlier version re-scanned the remaining text several times per
    separator through char_split, which made it quadratic.
    """
    words_sentence = []
    current = []
    for symbol in string:
        if symbol.isalnum():
            current.append(symbol)
        elif current:
            # A separator closes the run collected so far.
            words_sentence.append("".join(current))
            current = []
    if current:
        words_sentence.append("".join(current))
    return words_sentence

def getNameFile(string):
    """Return the last ``/``-separated component of *string* without a trailing ``.json``.

    Only a literal ``.json`` suffix is removed. The previous implementation
    used the unescaped, unanchored regex ``.json``, which also deleted any
    character followed by "json" anywhere in the name (for example
    "myjson_file.json" became "m_file").
    """
    name = string.split("/")[-1]
    suffix = ".json"
    if name.endswith(suffix):
        return name[:-len(suffix)]
    return name


def getIDrange(rango_ID, df):
    """Translate a textual ID selection into a list of zero-based indices.

    Args:
        rango_ID: either the string ``"All"`` or a comma-separated list of
            items. Each item is broken into pieces with ``clean_words``: two
            pieces ``a``, ``b`` select ``a-1 .. b-1`` inclusive, one piece
            ``a`` selects ``a-1``, and any other number of pieces is ignored.
        df: mapping-like object; only ``df['hashed_id']`` is read, and only
            for ``"All"``, to obtain the number of rows.

    Returns:
        The list of selected indices, in the order given.
    """
    if rango_ID == "All":
        return list(range(len(df['hashed_id'])))

    selected = []
    for item in rango_ID.split(","):
        bounds = clean_words(item)
        if len(bounds) == 2:
            start = int(bounds[0]) - 1
            stop = int(bounds[1])
            selected.extend(range(start, stop))
        elif len(bounds) == 1:
            selected.append(int(bounds[0]) - 1)
    return selected

def save_json(path, data, isIndent = True):
    """Serialize *data* as JSON and write it to *path*.

    Args:
        path: destination file; it is created or overwritten.
        data: JSON-serializable object.
        isIndent: when true the output is pretty-printed with an indent of
            11 spaces, otherwise it is written on a single line.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), so the
    file is opened explicitly as UTF-8. Without that the platform's default
    encoding was used, which can fail or produce a file that load_json
    (which reads UTF-8) cannot decode.
    """
    indent = 11 if isIndent else None
    json_object = json.dumps(data, indent=indent, ensure_ascii=False)
    # Writing output to a json file
    with open(path, "w", encoding="utf-8") as outfile:
        outfile.write(json_object)


def load_json(path):
    """Read a UTF-8 file holding one or more JSON values and return them as a list.

    Every ``"}\\n{"`` sequence in the file gets a comma inserted after the
    closing brace and the whole content is wrapped in ``[...]`` before
    parsing, so objects written one after another (each starting on a new
    line) are returned as separate list items.
    """
    with open(path, "r", encoding="utf8") as handle:
        content = handle.read()

    as_array = "[" + content.replace("}\n{", "},\n{") + "]"
    return json.loads(as_array)

def load_json_dtset(path):
    """Read a latin-1 file holding one or more JSON values and return them as a list.

    Works like load_json except for the file encoding: a comma is inserted
    in every ``"}\\n{"`` sequence and the content is wrapped in ``[...]``
    before it is parsed.
    """
    with open(path, "r", encoding="latin-1") as handle:
        content = handle.read()

    as_array = "[" + content.replace("}\n{", "},\n{") + "]"
    return json.loads(as_array)

    
def splitResponse(respuesta_alumno_raw):
    """Return the sentences of the student's answer, normalized.

    The raw answer is passed through ``re.sub`` with a pattern made of a
    backslash followed by a newline character (an escaped, i.e. literal,
    newline), replacing each match with a space; it is then lower-cased and
    cut into sentences with ``sent_tokenize``. Every period is removed from
    each sentence.

    Returns:
        The list of processed sentences, in order.
    """
    # Pre-process the student's response
    flattened = re.sub('\\\n', ' ', respuesta_alumno_raw)
    lowered = flattened.lower()

    # One entry per sentence, with the periods stripped
    return [re.sub('\\.', '', sentence) for sentence in sent_tokenize(lowered)]
        
def create_file_path(file, doctype):
    """
    Build the relative path under which *file* is stored.

    Inputs:
        file: the file, or sub-path + file, where the info is to be stored
        doctype: 1 - info from the API, 2 - output documents, 3 - images;
                 any other value selects the BERT models/documents folder

    Outputs:
        path: the folder prefix for the doctype followed by file
    """
    known_prefixes = (
        (1, "api/"),
        (2, "archivos/OutputFiles2/"),
        (3, "archivos/Images/"),
    )
    for code, prefix in known_prefixes:
        if doctype == code:
            return prefix + file
    # Fallback for every other doctype value
    return "codeScripts/Dependencies/BERT-models/Prueba3/" + file

def remove(path):
    """Delete the file, symbolic link or directory tree at *path*.

    *path* may be relative or absolute. Files and links are removed with
    ``os.remove``; directories are removed with everything they contain via
    ``shutil.rmtree``.

    Raises:
        ValueError: if *path* is neither a file, a link nor a directory.
    """
    is_file_or_link = os.path.isfile(path) or os.path.islink(path)
    if is_file_or_link:
        os.remove(path)  # single file or link
        return
    if not os.path.isdir(path):
        raise ValueError("file {} is not a file or dir.".format(path))
    shutil.rmtree(path)  # directory and all of its contents