# -*- coding: utf-8 -*-

import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

# Character-to-index mapping built during training.
with open("chardict.pickle", "rb") as f:
    char_to_ix = pickle.load(f)

# Tag scheme: B = beginning of a token, I = inside a token,
# N = not part of any token (e.g. whitespace).
tag_to_ix = {'N': 0, 'B': 1, 'I': 2}
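
# Illustrative tagging (hypothetical example, inferred from how the tags are
# consumed below): for the input "a cat", a perfect model would predict
# B N B I I -- 'B' on 'a' and 'c', 'I' on the 'a' and 't' of "cat", and
# 'N' on the space.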

class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes character embeddings as inputs and outputs hidden
        # states with dimensionality hidden_dim per direction.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # The linear layer maps from hidden-state space to tag space; its
        # input size is hidden_dim * 2 because the LSTM is bidirectional.
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, sentence):
        embeds = self.char_embeddings(sentence)
        # Reshape to (seq_len, batch=1, embedding_dim), as nn.LSTM expects.
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
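
# Shape sketch for forward (illustrative, for a sentence of n characters):
#   embeds:     (n, embedding_dim)
#   lstm_out:   (n, 1, hidden_dim * 2) after the view to (n, 1, embedding_dim)
#   tag_space:  (n, tagset_size)
#   tag_scores: (n, tagset_size) log-probabilities, one row per character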

# Load the trained model. This script performs inference only; the loss
# function and optimizer below are unused here and presumably mirror the
# training setup.

model = LSTMTagger(100, 256, len(char_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

model_save_name = 'classifier_bidirectional_emb100_hid256_epoch20.pt'

model.load_state_dict(torch.load(model_save_name))
model.eval()  # switch to evaluation mode for inference

def prepare_sequence(seq, to_ix):
    # Map each character to its index. Raises KeyError for characters
    # not present in the training vocabulary.
    idxs = [to_ix[ch] for ch in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prob_to_tag(out):
    # Pick the highest-scoring tag at each character position.
    ix_to_tag = list(tag_to_ix.keys())  # index -> tag; relies on dict insertion order
    _prob_to_tag = [ix_to_tag[ix] for ix in torch.argmax(out, dim=1).tolist()]
    return [_prob_to_tag]  # one tag sequence per sentence, as downstream code expects

def _char_to_token(samplesent, _sentence_tag_list):
    # Group characters into tokens according to the predicted tag sequence:
    # 'B' starts a new token, 'I' continues the current one, 'N' is skipped.
    token_list = []
    token = []
    for tag, ach in zip(_sentence_tag_list[0], samplesent):
        if tag == 'B':
            if token:  # flush the previous token, if any (avoids empty tokens after 'N')
                token_list.append(token)
            token = [ach]
        elif tag == 'I':
            token.append(ach)
        # tag == 'N': character belongs to no token; skip it
    if token:  # flush the final token, even when the sequence ends on 'N'
        token_list.append(token)
    return token_list

def char_unifier(_token_list):
    # Join each token's characters into a single string.
    return [''.join(token) for token in _token_list]

def tokenize(sentence):
    # Full pipeline: characters -> indices -> tag scores -> tags -> tokens.
    inputs = prepare_sequence(sentence, char_to_ix)
    with torch.no_grad():  # inference only; no gradients needed
        out = model(inputs)
    sentence_tag_list = prob_to_tag(out)
    token_char_list = _char_to_token(sentence, sentence_tag_list)
    token_list = char_unifier(token_char_list)
    return token_list
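
# A hedged usage sketch (not part of the original file): run the tokenizer on
# a sample string. The sentence below is hypothetical; every character in it,
# including spaces, must appear in chardict.pickle, otherwise prepare_sequence
# raises a KeyError.
if __name__ == "__main__":
    sample_sentence = "this is a test sentence"  # hypothetical input
    print(tokenize(sample_sentence))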