#import all the required libraries. import pandas as pd import numpy as np import re import seaborn as sns import matplotlib.pyplot as plt import warnings import csv warnings.simplefilter("ignore") import sklearn # Loading the dataset data = pd.read_csv("Language_Detection.csv") # value count for each language data["Language"].value_counts() # separating the independent and dependant features X = data["Text"] y = data["Language"] # converting categorical variables to numerical from sklearn.preprocessing import LabelEncoder le = LabelEncoder() y = le.fit_transform(y) # creating a list for appending the preprocessed text data_list = [] # iterating through all the text for text in X: # removing the symbols and numbers text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', text) text = re.sub(r'[[]]', ' ', text) # converting the text to lower case text = text.lower() # appending to data_list data_list.append(text) # creating bag of words using countvectorizer from sklearn.feature_extraction.text import CountVectorizer cv = CountVectorizer() X = cv.fit_transform(data_list).toarray() X.shape #train test splitting from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) #model creation and prediction from sklearn.naive_bayes import MultinomialNB model = MultinomialNB() model.fit(x_train, y_train) # prediction y_pred = model.predict(x_test) # model evaluation from sklearn.metrics import accuracy_score, confusion_matrix, classification_report ac = accuracy_score(y_test, y_pred) cm = confusion_matrix(y_test, y_pred) print("Accuracy is :",ac) # visualising the confusion matrix plt.figure(figsize=(15,10)) sns.heatmap(cm, annot = True) plt.show() # function for predicting language def predict(text): x = cv.transform([text]).toarray() lang = model.predict(x) lang = le.inverse_transform(lang) print("The langauge is in",lang[0]) # English #prediction("load the texts") #with open('processed/text_detected/language_detected.txt', 'w', newline="") as file: # csv.writer(file, delimiter=" ").writerows(prediction)