def correct_file(
        source="C:\\Users\\18664\\Desktop\\projet tutore\\apprentissage-master\\export_dataframe.csv",
        destination=r'C:\Users\18664\Desktop\projet tutore\export_dataframe1.csv'):
    """Realign the column headers of the exported dataframe and save the result.

    The header names of columns 1-4 are moved behind the names of the
    remaining columns, then the last four columns are moved back to
    positions 1-4. The net effect is that the header order is unchanged
    while the data under columns 1..n is rotated by four positions.

    Args:
        source: Tab-separated file to read.
        destination: Tab-separated file to write (the index is written too,
            exactly as ``DataFrame.to_csv`` does by default).

    Returns:
        None. The corrected table is only written to *destination*.
    """
    df = pd.read_csv(source, sep="\t")

    names = list(df.columns)
    # Keep the first name in place, then put names 5.. before names 1..4.
    df.columns = [names[0]] + names[5:] + names[1:5]

    # Move the four trailing columns (now carrying names 1..4) right after
    # the first column; popping from the end and inserting at 1 keeps their
    # relative order.
    for _ in range(4):
        moved = df.pop(df.columns[-1])
        df.insert(1, moved.name, moved)

    df.to_csv(destination, sep='\t')


def import_file(file='C:\\Users\\18664\\Desktop\\projet tutore\\export_dataframe1.csv', sep=","):
    """Load a delimited file and drop every row that contains a missing value.

    Args:
        file: Path of the file to read.
        sep: Field separator. Defaults to ``","``; note that ``correct_file``
            writes tab-separated output, so pass ``sep="\\t"`` to read it.

    Returns:
        A ``pandas.DataFrame`` without any row holding a NaN.
    """
    return pd.read_csv(file, sep=sep).dropna(how='any')
def divide_dataset_pos(df):
    """Split *df* for the position task.

    Features are every column from the fourth onwards, the target is the
    ``P_position`` column, and 40 % of the rows go to the test set.

    Returns:
        ``X_train, X_test, y_train, y_test`` from ``train_test_split``.
    """
    return train_test_split(df.iloc[:, 3:], df['P_position'], test_size=0.4)


def divide_dataset_thm(df):
    """Split *df* for the theme task.

    Same features as ``divide_dataset_pos``; the target is ``T_theme``.

    Returns:
        ``X_train, X_test, y_train, y_test`` from ``train_test_split``.
    """
    return train_test_split(df.iloc[:, 3:], df['T_theme'], test_size=0.4)


def divide_dataset_pos_ngrams(df):
    """Split the raw ``Declaration`` text against the ``Position`` target (60/40)."""
    return train_test_split(df['Declaration'], df['Position'], test_size=0.4)


def divide_dataset_thm_ngrams(df):
    """Split the raw ``Declaration`` text against the ``Thematique`` target (60/40)."""
    return train_test_split(df['Declaration'], df['Thematique'], test_size=0.4)


def _print_evaluation(task, description, y_true, y_pred):
    """Print accuracy, confusion matrix and per-class report for one task.

    Args:
        task: Task name inserted in the accuracy line ('position' / 'thème').
        description: Text identifying the estimator in the matrix title.
        y_true: Expected labels.
        y_pred: Predicted labels.
    """
    # Collect the labels from BOTH vectors. classification_report raises
    # "Number of classes does not match size of target_names" when the model
    # predicts a class that does not occur in y_true and only the y_true
    # labels are given as target_names.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))

    print(f'accuracy de {task}', accuracy_score(y_true, y_pred))
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    print(f'confusion matrix de {description}\n',
          pd.DataFrame(matrix, index=labels, columns=labels))
    print('(row=expected, col=predicted)')
    print(classification_report(y_true, y_pred, labels=labels,
                                target_names=[str(label) for label in labels]))


def predict(method):
    """Train *method* on the pre-computed feature splits and print its scores.

    Reads the module-level splits ``Xp_*``/``yp_*`` (position) and
    ``Xt_*``/``yt_*`` (theme) that the ``__main__`` section creates.

    Args:
        method: An unfitted scikit-learn estimator.
    """
    description = str(method)
    classifier_pipeline = make_pipeline(method)

    # The same pipeline is refitted for the second task; fit() starts over.
    classifier_pipeline.fit(Xp_train, yp_train)
    pos_predictions = classifier_pipeline.predict(Xp_test)

    classifier_pipeline.fit(Xt_train, yt_train)
    thm_predictions = classifier_pipeline.predict(Xt_test)

    _print_evaluation('position', description, yp_test, pos_predictions)
    _print_evaluation('thème', description, yt_test, thm_predictions)


def predict_ngrams(method, i=2):
    """Train *method* on binary word n-gram counts and print its scores.

    Reads the module-level splits ``Xpn_*``/``ypn_*`` (position) and
    ``Xtn_*``/``ytn_*`` (theme) that the ``__main__`` section creates.

    Args:
        method: An unfitted scikit-learn estimator.
        i: Largest n-gram size; n-grams of size 1..i are extracted.
    """
    word_vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, i),
                                      analyzer='word',
                                      max_features=1000,
                                      binary=True)
    description = str(method)
    classifier_pipeline = make_pipeline(word_vectorizer, method)

    classifier_pipeline.fit(Xpn_train, ypn_train)
    pos_predictions = classifier_pipeline.predict(Xpn_test)

    classifier_pipeline.fit(Xtn_train, ytn_train)
    thm_predictions = classifier_pipeline.predict(Xtn_test)

    _print_evaluation('position', description, ypn_test, pos_predictions)
    _print_evaluation('thème', description, ytn_test, thm_predictions)


if __name__ == '__main__':
    df = import_file()
    df_ngrams = import_file('C:\\Users\\18664\\Desktop\\projet tutore\\1.csv')
    # predict() / predict_ngrams() read these names as globals.
    Xp_train, Xp_test, yp_train, yp_test = divide_dataset_pos(df)
    Xt_train, Xt_test, yt_train, yt_test = divide_dataset_thm(df)
    Xpn_train, Xpn_test, ypn_train, ypn_test = divide_dataset_pos_ngrams(df_ngrams)
    Xtn_train, Xtn_test, ytn_train, ytn_test = divide_dataset_thm_ngrams(df_ngrams)
    # Uncomment the experiment(s) to run:
    #predict(LogisticRegression())
    #predict_ngrams(LogisticRegression(),2)
    #predict_ngrams(LogisticRegression(),3)
    #predict(RandomForestClassifier())
    #predict_ngrams(RandomForestClassifier(),2)
    #predict_ngrams(RandomForestClassifier(),3)
    #predict(MultinomialNB())
    #predict_ngrams(MultinomialNB(),2)
    #predict_ngrams(MultinomialNB(),3)
def _read_declaration_csv(filename):
    """Read one tab-separated export, tolerating malformed lines."""
    try:
        # pandas >= 1.3; "warn" matches the old error_bad_lines=False default.
        return pd.read_csv(filename, sep="\t", index_col=None, header=0,
                           on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know on_bad_lines.
        return pd.read_csv(filename, sep="\t", index_col=None, header=0,
                           error_bad_lines=False)


def pretreatment(file):
    """Merge every CSV of a directory and normalise the ``Position`` labels.

    Steps: concatenate all ``*.csv`` files of the directory, drop rows whose
    ``Declaration`` is a duplicate (first occurrence wins), drop rows with no
    referenced position, renumber the index from 0 and shorten the position
    labels to 'agi pour' / 'agi contre' / 'penché pour' / 'penché contre'.

    Args:
        file: Path of the directory holding the tab-separated CSV files.

    Returns:
        The merged ``pandas.DataFrame``.

    Raises:
        ValueError: If the directory contains no ``*.csv`` file.
    """
    # Sorted so that "first occurrence" in drop_duplicates is reproducible;
    # glob makes no promise about ordering.
    all_files = sorted(glob.glob(file + "/*.csv"))
    if not all_files:
        raise ValueError(f"no CSV file found in directory: {file!r}")

    # Merge the CSV files into a single dataframe.
    df_ngrams = pd.concat([_read_declaration_csv(name) for name in all_files],
                          axis=0, ignore_index=True)
    df_ngrams = df_ngrams.drop_duplicates(subset=['Declaration'])
    no_position = ' Pas de prise de position référencée pour les animaux '
    df_ngrams = df_ngrams[df_ngrams['Position'] != no_position]
    df_ngrams = df_ngrams.reset_index(drop=True)

    # Singular and plural wordings collapse onto the same short label.
    short_labels = {
        ' a agi pour les animaux ': 'agi pour',
        ' ont agi pour les animaux ': 'agi pour',
        ' a agi contre les animaux ': 'agi contre',
        ' ont agi contre les animaux ': 'agi contre',
        ' a penché pour les animaux ': 'penché pour',
        ' ont penché pour les animaux ': 'penché pour',
        ' a penché contre les animaux ': 'penché contre',
        ' ont penché contre les animaux ': 'penché contre',
    }
    df_ngrams['Position'] = df_ngrams['Position'].replace(short_labels)
    return df_ngrams


def import_file(file):
    """Load a comma-separated file and drop every row containing a NaN."""
    return pd.read_csv(file, sep=",").dropna(how='any')


def divide_dataset_pos(df):
    """Split features (4th column onwards) against ``P_position``, 80/20."""
    return train_test_split(df.iloc[:, 3:], df['P_position'], test_size=0.2)


def divide_dataset_thm(df):
    """Split features (4th column onwards) against ``T_theme``, 80/20."""
    return train_test_split(df.iloc[:, 3:], df['T_theme'], test_size=0.2)


def divide_dataset_pos_ngrams(df):
    """Split the ``Declaration`` text against ``Position``, 80/20."""
    return train_test_split(df['Declaration'], df['Position'], test_size=0.2)


def divide_dataset_thm_ngrams(df):
    """Split the ``Declaration`` text against ``Thematique``, 80/20."""
    return train_test_split(df['Declaration'], df['Thematique'], test_size=0.2)
def _make_classifier(name):
    """Return a fresh estimator for a radio-button value, or None if unknown."""
    factories = {
        'LogisticRegression': LogisticRegression,
        'RandomForestClassifier': RandomForestClassifier,
        'MultinomialNB': MultinomialNB,
        'DummyClassifier': DummyClassifier,
        'KNeighborsClassifier': KNeighborsClassifier,
        'DecisionTreeClassifier': DecisionTreeClassifier,
    }
    factory = factories.get(name)
    return None if factory is None else factory()


def _insert_evaluation(task, header, y_true, y_pred):
    """Append accuracy, confusion matrix and report for one task to the text box.

    Args:
        task: Task name used in the accuracy line ('position' / 'thème').
        header: Text following "confusion matrix de".
        y_true: Expected labels.
        y_pred: Predicted labels.
    """
    # Labels come from both vectors: classification_report raises when the
    # model predicts a class that is absent from y_true and target_names only
    # lists the y_true classes.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    matrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=labels),
                          index=labels, columns=labels)
    report = classification_report(y_true, y_pred, labels=labels,
                                   target_names=[str(label) for label in labels])

    t.insert('end', f"accuracy de {task}: {accuracy_score(y_true, y_pred)}\n")
    t.insert('end', f"confusion matrix de {header}\n{matrix}\n")
    t.insert('end', "(row=expected, col=predicted)\n")
    t.insert('end', f"{report}\n")


def predict():
    """Train the selected classifier on the pre-computed feature splits.

    Reads the module-level names ``Xp_*``/``yp_*`` and ``Xt_*``/``yt_*``.
    Nothing in the visible code assigns them, so their absence is reported
    in the text box instead of raising inside the Tk callback.
    """
    method = methods.get()
    classifier = _make_classifier(method)
    if classifier is None:
        # Previously classifier_pipeline stayed unbound and the callback crashed.
        t.insert('end', f"unknown method: {method!r}\n")
        return
    classifier_pipeline = make_pipeline(classifier)

    try:
        pos_split = (Xp_train, Xp_test, yp_train, yp_test)
        thm_split = (Xt_train, Xt_test, yt_train, yt_test)
    except NameError:
        t.insert('end', "bag-of-words data is not loaded\n")
        return

    classifier_pipeline.fit(pos_split[0], pos_split[2])
    pos_predictions = classifier_pipeline.predict(pos_split[1])

    classifier_pipeline.fit(thm_split[0], thm_split[2])
    thm_predictions = classifier_pipeline.predict(thm_split[1])

    _insert_evaluation('position', f"{method}", pos_split[3], pos_predictions)
    _insert_evaluation('thème', f"{method}", thm_split[3], thm_predictions)


def predict_ngrams():
    """Train the selected classifier on word n-gram counts and show its scores.

    Reads the GUI state: ``num`` (largest n-gram size), ``methods``
    (classifier name) and ``var_file1`` (directory of CSV files). Results are
    appended to the text widget ``t``.
    """
    i = num.get()
    if i < 1:
        # An unselected IntVar yields 0, and ngram_range=(1, 0) is rejected
        # by CountVectorizer.
        t.insert('end', "select an ngrams number first\n")
        return

    method = methods.get()
    classifier = _make_classifier(method)
    if classifier is None:
        t.insert('end', f"unknown method: {method!r}\n")
        return

    word_vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, i),
                                      analyzer='word',
                                      max_features=5000,
                                      binary=False)
    classifier_pipeline = make_pipeline(word_vectorizer, classifier)

    df_ngrams = pretreatment(var_file1.get())
    Xpn_train, Xpn_test, ypn_train, ypn_test = divide_dataset_pos_ngrams(df_ngrams)
    Xtn_train, Xtn_test, ytn_train, ytn_test = divide_dataset_thm_ngrams(df_ngrams)

    classifier_pipeline.fit(Xpn_train, ypn_train)
    pos_predictions = classifier_pipeline.predict(Xpn_test)
    classifier_pipeline.fit(Xtn_train, ytn_train)
    thm_predictions = classifier_pipeline.predict(Xtn_test)

    header = f"{method}, ngrams:{i}"
    _insert_evaluation('position', header, ypn_test, pos_predictions)
    _insert_evaluation('thème', header, ytn_test, thm_predictions)


def execute():
    """Run the pre-treatment and both splits once, then report "done".

    The splits are discarded; the call only checks that the directory in
    ``var_file1`` can be loaded and holds the expected columns.
    """
    # pretreatment() requires the directory; it was previously called with
    # no argument, which raised TypeError.
    df_ngrams = pretreatment(var_file1.get())
    divide_dataset_pos_ngrams(df_ngrams)
    divide_dataset_thm_ngrams(df_ngrams)
    t.insert('end', "done\n")
# ---------------------------------------------------------------------------
# GUI code: builds the main window and enters the Tk event loop.
# ---------------------------------------------------------------------------
window = tk.Tk()
window.title('ngram')
window.geometry('1280x720')

num_ngrams = (1,2,3)

# Classifier choice: one radio button per supported method, 50 px apart.
methods = tk.StringVar()
for _row, _method_name in enumerate(('LogisticRegression',
                                     'RandomForestClassifier',
                                     'MultinomialNB',
                                     'KNeighborsClassifier',
                                     'DecisionTreeClassifier',
                                     'DummyClassifier')):
    tk.Radiobutton(window, text=_method_name, variable=methods,
                   value=_method_name).place(x=50, y=460 + 50 * _row)
# Start with a valid selection; an empty value matches no classifier.
methods.set('LogisticRegression')
# .place() returns None, so these names were always None; kept so the
# module namespace is unchanged.
m1 = m2 = m3 = m4 = m5 = m6 = None

# Largest n-gram size: one radio button per value of num_ngrams.
num = tk.IntVar()
for _row, _size in enumerate(num_ngrams):
    tk.Radiobutton(window, text=str(_size), variable=num,
                   value=_size).place(x=50, y=810 + 50 * _row)
# An unset IntVar reads 0, which would give the invalid ngram_range (1, 0).
num.set(2)
n1 = n2 = n3 = None

#tk.Button(window,text='Pretreat',font=('calibri',12),width=15,height=1,command=execute).place(x=200,y=110)
#tk.Button(window,text='Predict with bag of words',font=('calibri',12),width=30,height=1,command=predict).place(x=50,y=215)
tk.Button(window,text='Predict with ngrams',font=('calibri',12),width=30,height=1,command=predict_ngrams).place(x=50,y=315)

tk.Label(window,text='data files :').place(x=10,y=10)
#tk.Label(window,text='csv file(ngrams) :').place(x=10,y=60)
#tk.Label(window,text='Pretreatment : ').place(x=10,y=115)
tk.Label(window,text='Methods : ').place(x=10,y=415)
tk.Label(window,text='Ngrams number : ').place(x=10,y=765)

# Directory holding the CSV files, read by predict_ngrams().
var_file1 = tk.StringVar()
entry_file1 = tk.Entry(window,textvariable=var_file1)
entry_file1.place(x=200,y=10)

#var_file2 = tk.StringVar()
#entry_file2 = tk.Entry(window,textvariable=var_file2)
#entry_file2.place(x=200,y=60)

# Output area the prediction callbacks write into.
t = tk.Text(window, height=50,width=165)
t.place(x=400,y=10)

window.mainloop()
def print_csvPath():
    """Ask the user for a directory and store it in ``var_file1``.

    Cancelling the dialog leaves the current value untouched.
    """
    # `import tkinter as tk` does not load the filedialog submodule, so
    # tk.filedialog raises AttributeError unless something else imported it.
    from tkinter import filedialog

    value = filedialog.askdirectory()
    if value:
        var_file1.set(value)


def _read_declarations(filename):
    """Read one tab-separated export, tolerating malformed lines."""
    try:
        # pandas >= 1.3; "warn" matches the old error_bad_lines=False default.
        return pd.read_csv(filename, sep="\t", index_col=None, header=0,
                           on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know on_bad_lines.
        return pd.read_csv(filename, sep="\t", index_col=None, header=0,
                           error_bad_lines=False)


def pretreatment(file):
    """Merge every CSV of a directory and normalise the ``Position`` labels.

    Steps: concatenate all ``*.csv`` files, drop rows whose ``Declaration``
    is a duplicate (first occurrence wins), drop rows with no referenced
    position, renumber the index from 0, then replace every position that
    contains one of 'agi pour', 'agi contre', 'penché pour', 'penché contre'
    by that short label.

    Args:
        file: Path of the directory holding the tab-separated CSV files.

    Returns:
        The merged ``pandas.DataFrame``.

    Raises:
        ValueError: If the directory contains no ``*.csv`` file.
    """
    # Sorted so that "first occurrence" in drop_duplicates is reproducible.
    all_files = sorted(glob.glob(file + "/*.csv"))
    if not all_files:
        raise ValueError(f"no CSV file found in directory: {file!r}")

    # Merge the CSV files into a single dataframe.
    df_ngrams = pd.concat([_read_declarations(name) for name in all_files],
                          axis=0, ignore_index=True)
    df_ngrams = df_ngrams.drop_duplicates(subset=['Declaration'])
    no_position = ' Pas de prise de position référencée pour les animaux '
    df_ngrams = df_ngrams[df_ngrams['Position'] != no_position]
    df_ngrams = df_ngrams.reset_index(drop=True)

    for label in ('agi pour', 'agi contre', 'penché pour', 'penché contre'):
        # na=False: a missing position must not poison the boolean mask.
        matches = df_ngrams['Position'].str.contains(label, regex=False, na=False)
        df_ngrams.loc[matches, 'Position'] = label
    # The column is NOT re-assigned from a Series captured before the loop:
    # under pandas copy-on-write that would undo the normalisation.
    return df_ngrams


def import_file(file):
    """Load a comma-separated file and drop every row containing a NaN."""
    return pd.read_csv(file, sep=",").dropna(how='any')


def divide_dataset_pos_ngrams(df):
    """Split the ``Declaration`` text against ``Position``, 80/20."""
    return train_test_split(df['Declaration'], df['Position'], test_size=0.2)


def divide_dataset_thm_ngrams(df):
    """Split the ``Declaration`` text against ``Thematique``, 80/20."""
    return train_test_split(df['Declaration'], df['Thematique'], test_size=0.2)
def _build_vectorizer(name, ngram_range):
    """Return the vectorizer selected in the GUI, or None for an unknown name."""
    # French stop words plus every ASCII punctuation character.
    nltk_stopwords = stopwords.words('french') + list(string.punctuation)
    if name == 'CountVectorizer':
        return CountVectorizer(lowercase=True,
                               ngram_range=ngram_range,
                               stop_words=nltk_stopwords,
                               max_features=5000)
    if name == 'TfidfVectorizer':
        return TfidfVectorizer(lowercase=True,
                               ngram_range=ngram_range,
                               stop_words=nltk_stopwords,
                               min_df=0.01,
                               max_features=5000)
    return None


def _build_classifier(name):
    """Return a fresh estimator for a radio-button value, or None if unknown."""
    factories = {
        'LogisticRegression': LogisticRegression,
        'RandomForestClassifier': RandomForestClassifier,
        'MultinomialNB': MultinomialNB,
        'DummyClassifier': DummyClassifier,
        'KNeighborsClassifier': KNeighborsClassifier,
        'DecisionTreeClassifier': DecisionTreeClassifier,
    }
    factory = factories.get(name)
    return None if factory is None else factory()


def _seen_labels(y_true, y_pred):
    """Return the sorted labels occurring in either vector."""
    return np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))


def _show_results(header, y_pos, pos_predictions, y_thm, thm_predictions):
    """Write the position and theme evaluation into the text widget."""
    # Labels are taken from expected AND predicted values and passed
    # explicitly: classification_report raises when a predicted class is
    # missing from target_names.
    labels_pos = _seen_labels(y_pos, pos_predictions)
    confusion_df_pos = pd.DataFrame(
        confusion_matrix(y_pos, pos_predictions, labels=labels_pos),
        index=labels_pos, columns=labels_pos)
    report_pos = classification_report(
        y_pos, pos_predictions, labels=labels_pos,
        target_names=[str(label) for label in labels_pos])

    labels_thm = _seen_labels(y_thm, thm_predictions)
    report_thm = classification_report(
        y_thm, thm_predictions, labels=labels_thm,
        target_names=[str(label) for label in labels_thm])

    t.insert('end', f"position accuracy : {accuracy_score(y_pos, pos_predictions)}\n")
    t.insert('end', f"{header}\n\n{confusion_df_pos}\n")
    t.insert('end', "(row=expected, col=predicted)\n\n")
    t.insert('end', f"{report_pos}\n")
    t.insert('end', "-------------------------------------------------\n")
    t.insert('end', f"theme accuracy : {accuracy_score(y_thm, thm_predictions)}\n")
    t.insert('end', f"{header}\n\n{report_thm}\n")


def predict_ngrams():
    """Train the selected model on n-gram features and display its scores.

    Reads the GUI state: ``num1``/``num2`` (n-gram range), ``vectorizers``,
    ``methods``, ``cross_validation`` and ``var_file1`` (CSV directory).
    With cross validation set to 'No' an 80/20 split is evaluated; otherwise
    predictions come from a shuffled 2-fold ``cross_val_predict``. The text
    widget ``t`` is cleared and then filled with the results.
    """
    t.delete("1.0", "end")

    # --- n-gram range ------------------------------------------------------
    try:
        mini = num1.get()
        maxi = num2.get()
    except tk.TclError:
        # IntVar.get() raises when the entry is empty or not an integer.
        t.insert('end', "N-grams range must be two integers\n")
        return
    if not 1 <= mini <= maxi:
        t.insert('end', f"invalid N-grams range: {mini}-{maxi}\n")
        return

    # --- model selection ---------------------------------------------------
    word_vectorizer = _build_vectorizer(vectorizers.get(), (mini, maxi))
    method = methods.get()
    classifier = _build_classifier(method)
    if word_vectorizer is None or classifier is None:
        t.insert('end', "select a vectorizer and a method first\n")
        return
    classifier_pipeline = make_pipeline(word_vectorizer, classifier)

    # --- machine learning --------------------------------------------------
    df_ngrams = pretreatment(var_file1.get())

    if cross_validation.get() == 'No':
        Xpn_train, Xpn_test, ypn_train, y_pos = divide_dataset_pos_ngrams(df_ngrams)
        Xtn_train, Xtn_test, ytn_train, y_thm = divide_dataset_thm_ngrams(df_ngrams)

        classifier_pipeline.fit(Xpn_train, ypn_train)
        pos_predictions = classifier_pipeline.predict(Xpn_test)
        classifier_pipeline.fit(Xtn_train, ytn_train)
        thm_predictions = classifier_pipeline.predict(Xtn_test)
    else:
        kfold = model_selection.KFold(n_splits=2, shuffle=True)

        X = df_ngrams['Declaration']
        y_pos = df_ngrams['Position']
        y_thm = df_ngrams['Thematique']

        pos_predictions = model_selection.cross_val_predict(
            classifier_pipeline, X, y_pos, cv=kfold)
        thm_predictions = model_selection.cross_val_predict(
            classifier_pipeline, X, y_thm, cv=kfold)

    _show_results(f"{method}, ngrams:{mini}-{maxi}",
                  y_pos, pos_predictions, y_thm, thm_predictions)
# ---------------------------------------------------------------------------
# GUI code: builds the main window and enters the Tk event loop.
# ---------------------------------------------------------------------------
window = tk.Tk()
window.title('N-grams')
window.geometry('1280x720')

# Classifier choice: one radio button every 30 px, starting at y=130.
methods = tk.StringVar()
for _row, _label in enumerate(('LogisticRegression',
                               'RandomForestClassifier',
                               'MultinomialNB',
                               'KNeighborsClassifier',
                               'DecisionTreeClassifier',
                               'DummyClassifier')):
    tk.Radiobutton(window, text=_label, variable=methods,
                   value=_label).place(x=50, y=130 + 30 * _row)
# .place() returns None, so these names have always been None.
m1 = m2 = m3 = m4 = m5 = m6 = None
methods.set('LogisticRegression')

# Vectorizer choice.
vectorizers = tk.StringVar()
for _row, _label in enumerate(('CountVectorizer', 'TfidfVectorizer')):
    tk.Radiobutton(window, text=_label, variable=vectorizers,
                   value=_label).place(x=50, y=340 + 30 * _row)
v1 = None
vectorizers.set('CountVectorizer')

# Cross validation on/off, both buttons on the same row.
cross_validation = tk.StringVar()
for _label, _x in (('Yes', 50), ('No', 150)):
    tk.Radiobutton(window, text=_label, variable=cross_validation,
                   value=_label).place(x=_x, y=490)
c1 = c2 = None
cross_validation.set('Yes')

# Lower and upper bound of the n-gram range.
num1 = tk.IntVar(value=1)
tk.Entry(window, textvariable=num1, width=1).place(x=50, y=430)
n1 = None
num2 = tk.IntVar(value=3)
tk.Entry(window, textvariable=num2, width=1).place(x=100, y=430)
n2 = None

# Action buttons.
_open_button = tk.Button(window, text='Open', width=8, height=1,
                         command=print_csvPath)
_open_button.place(x=160, y=50)
_predict_button = tk.Button(window, text='Predict with N-grams',
                            font=('calibri', 12), width=30, height=1,
                            command=predict_ngrams)
_predict_button.place(x=45, y=550)

# Section captions: (text, x, y).
for _caption, _x, _y in (('CSVs directory : ', 10, 10),
                         ('Methods : ', 10, 100),
                         ('Vectorizers : ', 10, 310),
                         ('N-grams range : ', 10, 400),
                         ('to', 70, 430),
                         ('Cross validation : ', 10, 460)):
    tk.Label(window, text=_caption).place(x=_x, y=_y)

# Directory of the CSV files; filled by the 'Open' button or typed by hand.
var_file1 = tk.StringVar()
_directory_entry = tk.Entry(window, textvariable=var_file1, width=25)
_directory_entry.place(x=135, y=10)

# Output area the prediction callback writes into.
t = tk.Text(window, height=41, width=96)
t.place(x=400, y=10)

window.mainloop()
def print_csvPath():
    """Ask the user for a directory and store it in ``var_file1``.

    Cancelling the dialog leaves the current value untouched.
    """
    # `import tkinter as tk` does not load the filedialog submodule, so
    # tk.filedialog raises AttributeError unless something else imported it.
    from tkinter import filedialog

    value = filedialog.askdirectory()
    if value:
        var_file1.set(value)


def _load_raw_csv(filename):
    """Read one tab-separated export, tolerating malformed lines."""
    try:
        # pandas >= 1.3; "warn" matches the old error_bad_lines=False default.
        return pd.read_csv(filename, sep="\t", index_col=None, header=0,
                           on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know on_bad_lines.
        return pd.read_csv(filename, sep="\t", index_col=None, header=0,
                           error_bad_lines=False)


def pretreatment(file):
    """Merge every CSV of a directory and normalise the ``Position`` labels.

    Steps: concatenate all ``*.csv`` files, drop rows whose ``Declaration``
    is a duplicate (first occurrence wins), drop rows with no referenced
    position, renumber the index from 0, then replace every position that
    contains one of 'agi pour', 'agi contre', 'penché pour', 'penché contre'
    by that short label.

    Args:
        file: Path of the directory holding the tab-separated CSV files.

    Returns:
        The merged ``pandas.DataFrame``.

    Raises:
        ValueError: If the directory contains no ``*.csv`` file.
    """
    # Sorted so that "first occurrence" in drop_duplicates is reproducible.
    all_files = sorted(glob.glob(file + "/*.csv"))
    if not all_files:
        raise ValueError(f"no CSV file found in directory: {file!r}")

    # Merge the CSV files into a single dataframe.
    df_ngrams = pd.concat([_load_raw_csv(name) for name in all_files],
                          axis=0, ignore_index=True)
    df_ngrams = df_ngrams.drop_duplicates(subset=['Declaration'])
    no_position = ' Pas de prise de position référencée pour les animaux '
    df_ngrams = df_ngrams[df_ngrams['Position'] != no_position]
    df_ngrams = df_ngrams.reset_index(drop=True)

    for label in ('agi pour', 'agi contre', 'penché pour', 'penché contre'):
        # na=False: a missing position must not poison the boolean mask.
        matches = df_ngrams['Position'].str.contains(label, regex=False, na=False)
        df_ngrams.loc[matches, 'Position'] = label
    # The column is NOT re-assigned from a Series captured before the loop:
    # under pandas copy-on-write that would undo the normalisation.
    return df_ngrams


def divide_dataset_pos_ngrams(df):
    """Split the ``Declaration`` text against ``Position``, 80/20."""
    return train_test_split(df['Declaration'], df['Position'], test_size=0.2)


def divide_dataset_thm_ngrams(df):
    """Split the ``Declaration`` text against ``Thematique``, 80/20."""
    return train_test_split(df['Declaration'], df['Thematique'], test_size=0.2)
def _select_vectorizer(name, ngram_range):
    """Return the vectorizer selected in the GUI, or None for an unknown name."""
    # French stop words plus every ASCII punctuation character.
    nltk_stopwords = stopwords.words('french') + list(string.punctuation)
    if name == 'CountVectorizer':
        return CountVectorizer(lowercase=True,
                               ngram_range=ngram_range,
                               stop_words=nltk_stopwords,
                               max_features=5000)
    if name == 'TfidfVectorizer':
        return TfidfVectorizer(lowercase=True,
                               ngram_range=ngram_range,
                               stop_words=nltk_stopwords,
                               min_df=0.01,
                               max_features=5000)
    return None


def _select_classifier(name):
    """Return a fresh estimator for a radio-button value, or None if unknown."""
    factories = {
        'LogisticRegression': LogisticRegression,
        'RandomForestClassifier': RandomForestClassifier,
        'MultinomialNB': MultinomialNB,
        'DummyClassifier': DummyClassifier,
        'KNeighborsClassifier': KNeighborsClassifier,
        'DecisionTreeClassifier': DecisionTreeClassifier,
    }
    factory = factories.get(name)
    return None if factory is None else factory()


def _labels_in(y_true, y_pred):
    """Return the sorted labels occurring in either vector."""
    return np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))


def _display_results(header, y_pos, pos_predictions, y_thm, thm_predictions):
    """Write the position and theme evaluation into the text widget."""
    # Labels are taken from expected AND predicted values and passed
    # explicitly: classification_report raises when a predicted class is
    # missing from target_names.
    labels_pos = _labels_in(y_pos, pos_predictions)
    confusion_df_pos = pd.DataFrame(
        confusion_matrix(y_pos, pos_predictions, labels=labels_pos),
        index=labels_pos, columns=labels_pos)
    report_pos = classification_report(
        y_pos, pos_predictions, labels=labels_pos,
        target_names=[str(label) for label in labels_pos])

    labels_thm = _labels_in(y_thm, thm_predictions)
    report_thm = classification_report(
        y_thm, thm_predictions, labels=labels_thm,
        target_names=[str(label) for label in labels_thm])

    t.insert('end', f"position accuracy : {accuracy_score(y_pos, pos_predictions)}\n")
    t.insert('end', f"{header}\n\n{confusion_df_pos}\n")
    t.insert('end', "(row=expected, col=predicted)\n\n")
    t.insert('end', f"{report_pos}\n")
    t.insert('end', "-------------------------------------------------\n")
    t.insert('end', f"theme accuracy : {accuracy_score(y_thm, thm_predictions)}\n")
    t.insert('end', f"{header}\n\n{report_thm}\n")


def predict_ngrams():
    """Train the selected model on n-gram features and display its scores.

    Reads the GUI state: ``num1``/``num2`` (n-gram range), ``vectorizers``,
    ``methods``, ``cross_validation`` and ``var_file1`` (CSV directory).
    With cross validation set to 'No' an 80/20 split is evaluated; otherwise
    predictions come from a shuffled 2-fold ``cross_val_predict``. The text
    widget ``t`` is cleared and then filled with the results.
    """
    t.delete("1.0", "end")

    # --- initialise the word vectorizer -----------------------------------
    try:
        mini = num1.get()
        maxi = num2.get()
    except tk.TclError:
        # IntVar.get() raises when the entry is empty or not an integer.
        t.insert('end', "N-grams range must be two integers\n")
        return
    if not 1 <= mini <= maxi:
        t.insert('end', f"invalid N-grams range: {mini}-{maxi}\n")
        return

    # --- choose a method ----------------------------------------------------
    word_vectorizer = _select_vectorizer(vectorizers.get(), (mini, maxi))
    method = methods.get()
    classifier = _select_classifier(method)
    if word_vectorizer is None or classifier is None:
        t.insert('end', "select a vectorizer and a method first\n")
        return
    classifier_pipeline = make_pipeline(word_vectorizer, classifier)

    # --- machine learning ---------------------------------------------------
    df_ngrams = pretreatment(var_file1.get())

    if cross_validation.get() == 'No':
        Xpn_train, Xpn_test, ypn_train, y_pos = divide_dataset_pos_ngrams(df_ngrams)
        Xtn_train, Xtn_test, ytn_train, y_thm = divide_dataset_thm_ngrams(df_ngrams)

        classifier_pipeline.fit(Xpn_train, ypn_train)
        pos_predictions = classifier_pipeline.predict(Xpn_test)
        classifier_pipeline.fit(Xtn_train, ytn_train)
        thm_predictions = classifier_pipeline.predict(Xtn_test)
    else:
        kfold = model_selection.KFold(n_splits=2, shuffle=True)

        X = df_ngrams['Declaration']
        y_pos = df_ngrams['Position']
        y_thm = df_ngrams['Thematique']

        pos_predictions = model_selection.cross_val_predict(
            classifier_pipeline, X, y_pos, cv=kfold)
        thm_predictions = model_selection.cross_val_predict(
            classifier_pipeline, X, y_thm, cv=kfold)

    _display_results(f"{method}, N-grams:{mini}-{maxi}",
                     y_pos, pos_predictions, y_thm, thm_predictions)
# ---------------------------------------------------------------------------
# GUI code: builds the main window and enters the Tk event loop.
# ---------------------------------------------------------------------------
window = tk.Tk()
window.title('N-grams')
window.geometry('1280x720')

# Classifier choice: one radio button every 30 px, starting at y=130.
methods = tk.StringVar()
_method_names = ('LogisticRegression', 'RandomForestClassifier',
                 'MultinomialNB', 'KNeighborsClassifier',
                 'DecisionTreeClassifier', 'DummyClassifier')
for _position, _choice in enumerate(_method_names):
    _button = tk.Radiobutton(window, text=_choice, variable=methods, value=_choice)
    _button.place(x=50, y=130 + 30 * _position)
# .place() returns None, so these names have always been None.
m1 = m2 = m3 = m4 = m5 = m6 = None
methods.set('LogisticRegression')

# Vectorizer choice.
vectorizers = tk.StringVar()
for _position, _choice in enumerate(('CountVectorizer', 'TfidfVectorizer')):
    _button = tk.Radiobutton(window, text=_choice, variable=vectorizers, value=_choice)
    _button.place(x=50, y=340 + 30 * _position)
v1 = None
vectorizers.set('CountVectorizer')

# Cross validation on/off, both buttons on the same row.
cross_validation = tk.StringVar()
for _choice, _left in (('Yes', 50), ('No', 150)):
    _button = tk.Radiobutton(window, text=_choice, variable=cross_validation, value=_choice)
    _button.place(x=_left, y=490)
c1 = c2 = None
cross_validation.set('Yes')

# Lower and upper bound of the n-gram range.
num1 = tk.IntVar(value=1)
tk.Entry(window, textvariable=num1, width=1).place(x=50, y=430)
n1 = None
num2 = tk.IntVar(value=3)
tk.Entry(window, textvariable=num2, width=1).place(x=100, y=430)
n2 = None

# Action buttons.
tk.Button(window, text='Open', width=8, height=1,
          command=print_csvPath).place(x=160, y=50)
tk.Button(window, text='Predict with N-grams', font=('calibri', 12),
          width=30, height=1, command=predict_ngrams).place(x=45, y=550)

# Section captions, keyed by their (x, y) position.
_captions = {
    (10, 10): 'CSVs directory : ',
    (10, 100): 'Methods : ',
    (10, 310): 'Vectorizers : ',
    (10, 400): 'N-grams range : ',
    (70, 430): 'to',
    (10, 460): 'Cross validation : ',
}
for (_left, _top), _text in _captions.items():
    tk.Label(window, text=_text).place(x=_left, y=_top)

# Directory of the CSV files; filled by the 'Open' button or typed by hand.
var_file1 = tk.StringVar()
tk.Entry(window, textvariable=var_file1, width=25).place(x=135, y=10)

# Output area the prediction callback writes into.
t = tk.Text(window, height=41, width=96)
t.place(x=400, y=10)

window.mainloop()