diff --git a/ngram.py b/ngram.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbd0a61b42b877c35af02a3e892164dfd074259d
--- /dev/null
+++ b/ngram.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import make_pipeline
+import numpy as np
+import pandas as pd
+
+# Suppress warning messages
+import warnings
+warnings.filterwarnings('ignore')
+# Avoid truncating the descriptions when displayed
+pd.set_option('display.max_colwidth', None)
+
+def correct_file():
+    '''
+    Correct the mis-ordered columns: relabel the header so the four labels
+    after the index move to the end, then shift the last four data columns
+    back to positions 1..4 so data and labels line up again.
+    '''
+    df = pd.read_csv("C:\\Users\\18664\\Desktop\\projet tutore\\apprentissage-master\\export_dataframe.csv", sep="\t")
+    
+    a = list(df.columns)
+    # Relabel: keep column 0, send labels 1..4 to the end
+    df.columns = [a[0]] + a[5:] + a[1:5]
+    # Move the last four data columns back to just after column 0
+    for i in range(0, 4):
+        last_col = df.pop(df.columns[-1])
+        df.insert(1, last_col.name, last_col)
+    df.to_csv(r'C:\Users\18664\Desktop\projet tutore\export_dataframe1.csv', sep='\t' )
+    
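+# A hypothetical illustration of the relabel-and-rotate in correct_file(),
+# for a frame with columns [id, A, B, C, D, w1, w2]:
+#   after relabelling: [id, w1, w2, A, B, C, D]
+#   after the loop   : [id, A, B, C, D, w1, w2], with the data moved to match
+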
+def import_file(file='C:\\Users\\18664\\Desktop\\projet tutore\\export_dataframe1.csv'):
+    df = pd.read_csv(file, sep=",")
+    df = df.dropna(how = 'any')
+    return df
+
+def divide_dataset_pos(df):
+    X = df.iloc[:,3:]    
+    y_pos = df['P_position']
+    return train_test_split(X, y_pos, test_size=0.4)
+    
+def divide_dataset_thm(df):
+    X = df.iloc[:,3:]
+    y_theme = df['T_theme']
+    return train_test_split(X, y_theme, test_size=0.4)
+
+def divide_dataset_pos_ngrams(df):
+    X = df['Declaration']
+    y_pos = df['Position']
+    return train_test_split(X, y_pos, test_size=0.4)
+
+def divide_dataset_thm_ngrams(df):
+    X = df['Declaration']
+    y_thm = df['Thematique']
+    return train_test_split(X, y_thm, test_size=0.4)
+
+def predict(method):
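+    # NOTE: relies on the train/test splits created at module level in __main__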
+    description = str(method)
+    
+    classifier_pipeline = make_pipeline(method)
+    
+    classifier_pipeline.fit(Xp_train, yp_train)
+    pos_predictions = classifier_pipeline.predict(Xp_test)
+    
+    classifier_pipeline.fit(Xt_train, yt_train)
+    thm_predictions = classifier_pipeline.predict(Xt_test)
+    
+    print('position accuracy', accuracy_score(yp_test, pos_predictions))
+    labels_pos = np.unique(yp_test)
+    cm_pos = confusion_matrix(yp_test, pos_predictions, labels=labels_pos)
+    confusion_df_pos = pd.DataFrame(cm_pos, index=labels_pos, columns=labels_pos)
+    print(f'confusion matrix for {description}\n', confusion_df_pos)
+    print('(row=expected, col=predicted)')
+    print(classification_report(yp_test, pos_predictions, target_names=labels_pos))
+    
+    print('theme accuracy', accuracy_score(yt_test, thm_predictions))
+    labels_thm = np.unique(yt_test)
+    cm_thm = confusion_matrix(yt_test, thm_predictions, labels=labels_thm)
+    confusion_df_thm = pd.DataFrame(cm_thm, index=labels_thm, columns=labels_thm)
+    print(f'confusion matrix for {description}\n', confusion_df_thm)
+    print('(row=expected, col=predicted)')
+    print(classification_report(yt_test, thm_predictions, target_names=labels_thm))
+    
+def predict_ngrams(method, i=2):
+    word_vectorizer = CountVectorizer(lowercase=True,
+                                      ngram_range=(1, i),
+                                      analyzer='word',
+                                      max_features=1000,
+                                      binary=True)
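+
+    # Illustration (hypothetical input): with i=2, the declaration
+    # 'protection des animaux' produces the features
+    # ['protection', 'des', 'animaux', 'protection des', 'des animaux'];
+    # binary=True records presence/absence rather than raw counts.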
+    
+    description = str(method)
+    
+    classifier_pipeline = make_pipeline(word_vectorizer,method)
+    
+    classifier_pipeline.fit(Xpn_train, ypn_train)
+    pos_predictions = classifier_pipeline.predict(Xpn_test)
+    
+    classifier_pipeline.fit(Xtn_train, ytn_train)
+    thm_predictions = classifier_pipeline.predict(Xtn_test)
+    
+    print('position accuracy', accuracy_score(ypn_test, pos_predictions))
+    labels_pos = np.unique(ypn_test)
+    cm_pos = confusion_matrix(ypn_test, pos_predictions, labels=labels_pos)
+    confusion_df_pos = pd.DataFrame(cm_pos, index=labels_pos, columns=labels_pos)
+    print(f'confusion matrix for {description}\n', confusion_df_pos)
+    print('(row=expected, col=predicted)')
+    print(classification_report(ypn_test, pos_predictions, target_names=labels_pos))
+    
+    print('theme accuracy', accuracy_score(ytn_test, thm_predictions))
+    labels_thm = np.unique(ytn_test)
+    cm_thm = confusion_matrix(ytn_test, thm_predictions, labels=labels_thm)
+    confusion_df_thm = pd.DataFrame(cm_thm, index=labels_thm, columns=labels_thm)
+    print(f'confusion matrix for {description}\n', confusion_df_thm)
+    print('(row=expected, col=predicted)')
+    print(classification_report(ytn_test, thm_predictions, target_names=labels_thm))
+
+if __name__ == '__main__':
+    df = import_file()
+    df_ngrams = import_file('C:\\Users\\18664\\Desktop\\projet tutore\\1.csv')
+    Xp_train, Xp_test, yp_train, yp_test = divide_dataset_pos(df)
+    Xt_train, Xt_test, yt_train, yt_test = divide_dataset_thm(df)
+    Xpn_train, Xpn_test, ypn_train, ypn_test = divide_dataset_pos_ngrams(df_ngrams)
+    Xtn_train, Xtn_test, ytn_train, ytn_test = divide_dataset_thm_ngrams(df_ngrams)
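+    # Uncomment any of the calls below to run an experiment: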
+    #predict(LogisticRegression())
+    #predict_ngrams(LogisticRegression(),2)
+    #predict_ngrams(LogisticRegression(),3)
+    #predict(RandomForestClassifier())
+    #predict_ngrams(RandomForestClassifier(),2)
+    #predict_ngrams(RandomForestClassifier(),3)
+    #predict(MultinomialNB())
+    #predict_ngrams(MultinomialNB(),2)
+    #predict_ngrams(MultinomialNB(),3)
\ No newline at end of file
diff --git a/ngramsGUI.py b/ngramsGUI.py
new file mode 100644
index 0000000000000000000000000000000000000000..65357d3c694535b6652c13c4b09ff8e755cc1eac
--- /dev/null
+++ b/ngramsGUI.py
@@ -0,0 +1,222 @@
+# -*- coding: utf-8 -*-
+
+import tkinter as tk
+import glob
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.dummy import DummyClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import make_pipeline
+import numpy as np
+import pandas as pd
+
+# Suppress warning messages
+import warnings
+warnings.filterwarnings('ignore')
+# Avoid truncating the descriptions when displayed
+pd.set_option('display.max_colwidth', None)
+
+def pretreatment(file):
+    path = file
+    all_files = glob.glob(path + "/*.csv")
+    li = []
+    
+    for filename in all_files:
+        # on_bad_lines='skip' supersedes the deprecated error_bad_lines=False
+        df = pd.read_csv(filename, sep="\t", index_col=None, header=0, on_bad_lines='skip')
+        li.append(df)
+    
+    ## Merge the csv files into a single dataframe ##
+    df_ngrams = pd.concat(li, axis=0, ignore_index=True)
+    df_ngrams = df_ngrams.drop_duplicates(subset=['Declaration'])
+    df_ngrams = df_ngrams.drop(df_ngrams[df_ngrams['Position'] == '            Pas de prise de position référencée pour les animaux          '].index)
+    df_ngrams.index = range(len(df_ngrams))
+    position = df_ngrams['Position']
+    position = position.replace(['            a agi pour les animaux          ','            ont agi pour les animaux          '],'agi pour')
+    position = position.replace(['            a agi contre les animaux          ','            ont agi contre les animaux          '],'agi contre')
+    position = position.replace(['            a penché pour les animaux          ','            ont penché pour les animaux          '],'penché pour')
+    position = position.replace(['            a penché contre les animaux          ','            ont penché contre les animaux          '],'penché contre')
+    df_ngrams['Position'] = position
+    return df_ngrams
+    
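+# For reference, pretreatment() collapses the raw scraped labels, e.g.
+# '            a agi pour les animaux          '  ->  'agi pour'
+# (the plural 'ont ...' variants are collapsed the same way).
+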
+def import_file(file):
+    df = pd.read_csv(file, sep=",")
+    df = df.dropna(how = 'any')
+    return df
+
+def divide_dataset_pos(df):
+    X = df.iloc[:,3:]    
+    y_pos = df['P_position']
+    return train_test_split(X, y_pos, test_size=0.2)
+    
+def divide_dataset_thm(df):
+    X = df.iloc[:,3:]
+    y_theme = df['T_theme']
+    return train_test_split(X, y_theme, test_size=0.2)
+
+def divide_dataset_pos_ngrams(df):
+    X = df['Declaration']
+    y_pos = df['Position']
+    return train_test_split(X, y_pos, test_size=0.2)
+
+def divide_dataset_thm_ngrams(df):
+    X = df['Declaration']
+    y_thm = df['Thematique']
+    return train_test_split(X, y_thm, test_size=0.2)
+
+def predict():
+    # NOTE: expects the bag-of-words splits (Xp_train, Xt_train, ...) that the
+    # commented-out calls in execute() would create at module level
+    '''
+    choose method
+    '''
+    method = methods.get()
+    if method == 'LogisticRegression':
+        classifier_pipeline = make_pipeline(LogisticRegression())
+    elif method == 'RandomForestClassifier':
+        classifier_pipeline = make_pipeline(RandomForestClassifier())
+    elif method == 'MultinomialNB':
+        classifier_pipeline = make_pipeline(MultinomialNB())
+    else:
+        t.insert('end', f"{method} is not supported here\n")
+        return
+    
+    '''
+    machine learning code
+    '''
+    classifier_pipeline.fit(Xp_train, yp_train)
+    pos_predictions = classifier_pipeline.predict(Xp_test)
+    
+    classifier_pipeline.fit(Xt_train, yt_train)
+    thm_predictions = classifier_pipeline.predict(Xt_test)
+    
+    t.insert('end',f"accuracy de position: {accuracy_score(yp_test, pos_predictions)}\n")
+    labels_pos = np.unique(yp_test)
+    cm_pos = confusion_matrix(yp_test, pos_predictions, labels=labels_pos)
+    confusion_df_pos = pd.DataFrame(cm_pos, index=labels_pos, columns=labels_pos)
+    t.insert('end',f"confusion matrix de {method}\n{confusion_df_pos}\n")
+    t.insert('end',f"(row=expected, col=predicted)\n")
+    t.insert('end',f"{classification_report(yp_test, pos_predictions, target_names=labels_pos)}\n")
+    
+    t.insert('end',f"accuracy de thème: {accuracy_score(yt_test,thm_predictions)}\n")
+    labels_thm = np.unique(yt_test)
+    cm_thm = confusion_matrix(yt_test, thm_predictions, labels=labels_thm)
+    confusion_df_thm = pd.DataFrame(cm_thm, index=labels_thm, columns=labels_thm)
+    t.insert('end',f"confusion matrix de {method}\n{confusion_df_thm}\n")
+    t.insert('end',f"(row=expected, col=predicted)\n")
+    t.insert('end',f"{classification_report(yt_test, thm_predictions, target_names=labels_thm)}\n")
+    
+def predict_ngrams():
+    '''
+    initialize word vectorizer
+    '''
+    i = num.get()
+    word_vectorizer = CountVectorizer(lowercase=True,
+                                      ngram_range=(1, i),
+                                      analyzer='word',
+                                      max_features=5000,
+                                      binary=False)
+    
+    '''
+    choose a method
+    '''
+    method = methods.get()
+    if method == 'LogisticRegression':
+        classifier_pipeline = make_pipeline(word_vectorizer,LogisticRegression())
+    elif method == 'RandomForestClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,RandomForestClassifier())
+    elif method == 'MultinomialNB':
+        classifier_pipeline = make_pipeline(word_vectorizer,MultinomialNB())
+    elif method == 'DummyClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,DummyClassifier())
+    elif method == 'KNeighborsClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,KNeighborsClassifier())
+    elif method == 'DecisionTreeClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,DecisionTreeClassifier())
+    
+    '''
+    machine learning code
+    '''
+    file = var_file1.get()
+    df_ngrams = pretreatment(file)
+    Xpn_train, Xpn_test, ypn_train, ypn_test = divide_dataset_pos_ngrams(df_ngrams)
+    Xtn_train, Xtn_test, ytn_train, ytn_test = divide_dataset_thm_ngrams(df_ngrams)
+    classifier_pipeline.fit(Xpn_train, ypn_train)
+    pos_predictions = classifier_pipeline.predict(Xpn_test)
+    classifier_pipeline.fit(Xtn_train, ytn_train)
+    thm_predictions = classifier_pipeline.predict(Xtn_test)
+    
+    t.insert('end',f"accuracy de position: {accuracy_score(ypn_test, pos_predictions)}\n")
+    labels_pos = np.unique(ypn_test)
+    cm_pos = confusion_matrix(ypn_test, pos_predictions, labels=labels_pos)
+    confusion_df_pos = pd.DataFrame(cm_pos, index=labels_pos, columns=labels_pos)
+    t.insert('end',f"confusion matrix de {method}, ngrams:{i}\n{confusion_df_pos}\n")
+    t.insert('end',f"(row=expected, col=predicted)\n")
+    t.insert('end',f"{classification_report(ypn_test, pos_predictions, target_names=labels_pos)}\n")
+    
+    t.insert('end',f"accuracy de thème: {accuracy_score(ytn_test,thm_predictions)}\n")
+    labels_thm = np.unique(ytn_test)
+    cm_thm = confusion_matrix(ytn_test, thm_predictions, labels=labels_thm)
+    confusion_df_thm = pd.DataFrame(cm_thm, index=labels_thm, columns=labels_thm)
+    t.insert('end',f"confusion matrix de {method}, ngrams:{i}\n{confusion_df_thm}\n")
+    t.insert('end',f"(row=expected, col=predicted)\n")
+    t.insert('end',f"{classification_report(ytn_test, thm_predictions, target_names=labels_thm)}\n")
+    
+    # Debug helper: inspect the bag-of-words matrix (fit_transform, not
+    # transform, since this standalone vectorizer is unfitted)
+    #res = word_vectorizer.fit_transform(Xpn_test)
+    #bow = pd.DataFrame(res.toarray(), columns=word_vectorizer.get_feature_names_out())
+    #print(bow)
+    
+def execute():
+    # Pre-computes the n-gram splits; predict_ngrams() builds its own splits
+    # from the directory, so this step is optional
+    #df = import_file(var_file2.get())
+    df_ngrams = pretreatment(var_file1.get())
+    #Xp_train, Xp_test, yp_train, yp_test = divide_dataset_pos(df)
+    #Xt_train, Xt_test, yt_train, yt_test = divide_dataset_thm(df)
+    Xpn_train, Xpn_test, ypn_train, ypn_test = divide_dataset_pos_ngrams(df_ngrams)
+    Xtn_train, Xtn_test, ytn_train, ytn_test = divide_dataset_thm_ngrams(df_ngrams)
+    t.insert('end',"done\n")
+
+'''
+GUI code
+'''
+window = tk.Tk()
+window.title('ngram')
+window.geometry('1280x720')
+
+methods = tk.StringVar()
+tk.Radiobutton(window, text='LogisticRegression', variable=methods, value='LogisticRegression').place(x=50,y=460)
+tk.Radiobutton(window, text='RandomForestClassifier', variable=methods, value='RandomForestClassifier').place(x=50,y=510)
+tk.Radiobutton(window, text='MultinomialNB', variable=methods, value='MultinomialNB').place(x=50,y=560)
+tk.Radiobutton(window, text='KNeighborsClassifier', variable=methods, value='KNeighborsClassifier').place(x=50,y=610)
+tk.Radiobutton(window, text='DecisionTreeClassifier', variable=methods, value='DecisionTreeClassifier').place(x=50,y=660)
+tk.Radiobutton(window, text='DummyClassifier', variable=methods, value='DummyClassifier').place(x=50,y=710)
+methods.set('LogisticRegression')
+
+# default to bigrams so an unselected radiobutton cannot yield ngram_range=(1, 0)
+num = tk.IntVar(value=2)
+tk.Radiobutton(window, text='1', variable=num, value=1).place(x=50,y=810)
+tk.Radiobutton(window, text='2', variable=num, value=2).place(x=50,y=860)
+tk.Radiobutton(window, text='3', variable=num, value=3).place(x=50,y=910)
+
+#tk.Button(window,text='Pretreat',font=('calibri',12),width=15,height=1,command=execute).place(x=200,y=110)
+#tk.Button(window,text='Predict with bag of words',font=('calibri',12),width=30,height=1,command=predict).place(x=50,y=215)
+tk.Button(window,text='Predict with ngrams',font=('calibri',12),width=30,height=1,command=predict_ngrams).place(x=50,y=315)
+
+tk.Label(window,text='data files :').place(x=10,y=10)
+#tk.Label(window,text='csv file(ngrams) :').place(x=10,y=60)
+#tk.Label(window,text='Pretreatment : ').place(x=10,y=115)
+tk.Label(window,text='Methods : ').place(x=10,y=415)
+tk.Label(window,text='Ngrams number : ').place(x=10,y=765)
+
+var_file1 = tk.StringVar()
+entry_file1 = tk.Entry(window,textvariable=var_file1)
+entry_file1.place(x=200,y=10)
+
+#var_file2 = tk.StringVar()
+#entry_file2 = tk.Entry(window,textvariable=var_file2)
+#entry_file2.place(x=200,y=60)
+
+t = tk.Text(window, height=50,width=165)
+t.place(x=400,y=10)
+
+window.mainloop()
\ No newline at end of file
diff --git a/ngramsGUI_UltraSuperInvincible.py b/ngramsGUI_UltraSuperInvincible.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2de94bacc087e9b769b3510dfa85b30c524aab7
--- /dev/null
+++ b/ngramsGUI_UltraSuperInvincible.py
@@ -0,0 +1,214 @@
+# -*- coding: utf-8 -*-
+
+import tkinter as tk
+from tkinter import filedialog
+import glob
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.dummy import DummyClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn import model_selection
+from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import make_pipeline
+from sklearn.feature_extraction.text import TfidfVectorizer
+from nltk.corpus import stopwords
+import numpy as np
+import pandas as pd
+import string
+
+# Suppress warning messages
+import warnings
+warnings.filterwarnings('ignore')
+# Avoid truncating the descriptions when displayed
+pd.set_option('display.max_colwidth', None)
+
+def print_csvPath():
+    # Let the user pick the directory that contains the raw csv files
+    value = filedialog.askdirectory()
+    var_file1.set(value)
+
+def pretreatment(file):
+    path = file
+    all_files = glob.glob(path + "/*.csv")
+    li = []
+    
+    for filename in all_files:
+        # on_bad_lines='skip' supersedes the deprecated error_bad_lines=False
+        df = pd.read_csv(filename, sep="\t", index_col=None, header=0, on_bad_lines='skip')
+        li.append(df)
+    
+    ## Merge the csv files into a single dataframe ##
+    df_ngrams = pd.concat(li, axis=0, ignore_index=True)
+    df_ngrams = df_ngrams.drop_duplicates(subset=['Declaration'])
+    df_ngrams = df_ngrams.drop(df_ngrams[df_ngrams['Position'] == '            Pas de prise de position référencée pour les animaux          '].index)
+    df_ngrams.index = range(len(df_ngrams))
+    # Normalise the Position labels; match against a copy so earlier
+    # assignments cannot affect later matching
+    position = df_ngrams['Position'].copy()
+    df_ngrams.loc[position.str.contains('agi pour'), 'Position'] = 'agi pour'
+    df_ngrams.loc[position.str.contains('agi contre'), 'Position'] = 'agi contre'
+    df_ngrams.loc[position.str.contains('penché pour'), 'Position'] = 'penché pour'
+    df_ngrams.loc[position.str.contains('penché contre'), 'Position'] = 'penché contre'
+    return df_ngrams
+
+    
+def import_file(file):
+    df = pd.read_csv(file, sep=",")
+    df = df.dropna(how = 'any')
+    return df
+
+def divide_dataset_pos_ngrams(df):
+    X = df['Declaration']
+    y_pos = df['Position']
+    return train_test_split(X, y_pos, test_size=0.2)
+
+def divide_dataset_thm_ngrams(df):
+    X = df['Declaration']
+    y_thm = df['Thematique']
+    return train_test_split(X, y_thm, test_size=0.2)
+    
+def predict_ngrams():
+    t.delete("1.0","end")
+    
+    '''
+    initialize word vectorizer
+    '''
+    mini = num1.get()
+    maxi = num2.get()
+    
+    nltk_stopwords = stopwords.words('french') + list(string.punctuation)
+    
+    count_vectorizer = CountVectorizer(lowercase=True,
+                                       ngram_range=(mini,maxi), 
+                                       stop_words=nltk_stopwords,
+                                       max_features=5000)
+    
+    
+    tfidf_vectorizer = TfidfVectorizer(lowercase=True,
+                                       ngram_range=(mini,maxi),
+                                       stop_words=nltk_stopwords,
+                                       min_df=0.01,
+                                       max_features=5000)
+    
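+    # CountVectorizer scores each n-gram by its raw count, while
+    # TfidfVectorizer reweights counts by inverse document frequency;
+    # min_df=0.01 drops n-grams appearing in fewer than 1% of declarations.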
+    '''
+    choose a method
+    '''
+    vectorizer = vectorizers.get()
+    if vectorizer == 'CountVectorizer':
+        word_vectorizer = count_vectorizer
+    elif vectorizer == 'TfidfVectorizer':
+        word_vectorizer = tfidf_vectorizer
+    
+    method = methods.get()
+    if method == 'LogisticRegression':
+        classifier_pipeline = make_pipeline(word_vectorizer,LogisticRegression())
+    elif method == 'RandomForestClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,RandomForestClassifier())
+    elif method == 'MultinomialNB':
+        classifier_pipeline = make_pipeline(word_vectorizer,MultinomialNB())
+    elif method == 'DummyClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,DummyClassifier())
+    elif method == 'KNeighborsClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,KNeighborsClassifier())
+    elif method == 'DecisionTreeClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,DecisionTreeClassifier())
+    
+    '''
+    machine learning code
+    '''
+    file = var_file1.get()
+    df_ngrams = pretreatment(file)
+    cv = cross_validation.get()
+    
+    if cv == 'No':
+        Xpn_train, Xpn_test, ypn_train, ypn_test = divide_dataset_pos_ngrams(df_ngrams)
+        Xtn_train, Xtn_test, ytn_train, ytn_test = divide_dataset_thm_ngrams(df_ngrams)
+        
+        classifier_pipeline.fit(Xpn_train, ypn_train)
+        pos_predictions = classifier_pipeline.predict(Xpn_test)
+        classifier_pipeline.fit(Xtn_train, ytn_train)
+        thm_predictions = classifier_pipeline.predict(Xtn_test)
+        
+        t.insert('end',f"position accuracy : {accuracy_score(ypn_test, pos_predictions)}\n")
+        labels_pos = np.unique(ypn_test)
+        cm_pos = confusion_matrix(ypn_test, pos_predictions, labels=labels_pos)
+        confusion_df_pos = pd.DataFrame(cm_pos, index=labels_pos, columns=labels_pos)
+        t.insert('end',f"{method}, ngrams:{mini}-{maxi}\n\n{confusion_df_pos}\n")
+        t.insert('end',f"(row=expected, col=predicted)\n\n")
+        t.insert('end',f"{classification_report(ypn_test, pos_predictions, target_names=labels_pos)}\n")
+        t.insert('end',"-------------------------------------------------\n")
+        t.insert('end',f"theme accuracy : {accuracy_score(ytn_test,thm_predictions)}\n")
+        labels_thm = np.unique(ytn_test)
+        t.insert('end',f"{method}, ngrams:{mini}-{maxi}\n\n{classification_report(ytn_test, thm_predictions, target_names=labels_thm)}\n")
+    else:
+        kfold = model_selection.KFold(n_splits=2, shuffle=True)
+        
+        X = df_ngrams['Declaration']
+        y_pos = df_ngrams['Position']
+        y_thm = df_ngrams['Thematique']
+        
+        pos_predictions = model_selection.cross_val_predict(classifier_pipeline, X, y_pos,
+                                                   cv=kfold)
+        thm_predictions = model_selection.cross_val_predict(classifier_pipeline, X, y_thm,
+                                                   cv=kfold)
+        
+        t.insert('end',f"position accuracy : {accuracy_score(y_pos, pos_predictions)}\n")
+        labels_pos = np.unique(y_pos)
+        cm_pos = confusion_matrix(y_pos, pos_predictions, labels=labels_pos)
+        confusion_df_pos = pd.DataFrame(cm_pos, index=labels_pos, columns=labels_pos)
+        t.insert('end',f"{method}, ngrams:{mini}-{maxi}\n\n{confusion_df_pos}\n")
+        t.insert('end',f"(row=expected, col=predicted)\n\n")
+        t.insert('end',f"{classification_report(y_pos, pos_predictions, target_names=labels_pos)}\n")
+        t.insert('end',"-------------------------------------------------\n")
+        t.insert('end',f"theme accuracy : {accuracy_score(y_thm,thm_predictions)}\n")
+        labels_thm = np.unique(y_thm)
+        t.insert('end',f"{method}, ngrams:{mini}-{maxi}\n\n{classification_report(y_thm, thm_predictions, target_names=labels_thm)}\n")
+
+'''
+GUI code
+'''
+window = tk.Tk()
+window.title('N-grams')
+window.geometry('1280x720')
+
+methods = tk.StringVar()
+tk.Radiobutton(window, text='LogisticRegression', variable=methods, value='LogisticRegression').place(x=50,y=130)
+tk.Radiobutton(window, text='RandomForestClassifier', variable=methods, value='RandomForestClassifier').place(x=50,y=160)
+tk.Radiobutton(window, text='MultinomialNB', variable=methods, value='MultinomialNB').place(x=50,y=190)
+tk.Radiobutton(window, text='KNeighborsClassifier', variable=methods, value='KNeighborsClassifier').place(x=50,y=220)
+tk.Radiobutton(window, text='DecisionTreeClassifier', variable=methods, value='DecisionTreeClassifier').place(x=50,y=250)
+tk.Radiobutton(window, text='DummyClassifier', variable=methods, value='DummyClassifier').place(x=50,y=280)
+methods.set('LogisticRegression')
+
+vectorizers = tk.StringVar()
+tk.Radiobutton(window, text='CountVectorizer', variable=vectorizers, value='CountVectorizer').place(x=50,y=340)
+tk.Radiobutton(window, text='TfidfVectorizer', variable=vectorizers, value='TfidfVectorizer').place(x=50,y=370)
+vectorizers.set('CountVectorizer')
+
+cross_validation = tk.StringVar()
+tk.Radiobutton(window, text='Yes', variable=cross_validation, value='Yes').place(x=50,y=490)
+tk.Radiobutton(window, text='No', variable=cross_validation, value='No').place(x=150,y=490)
+cross_validation.set('Yes')
+
+num1 = tk.IntVar(value=1)
+tk.Entry(window,textvariable=num1,width=1).place(x=50,y=430)
+num2 = tk.IntVar(value=3)
+tk.Entry(window,textvariable=num2,width=1).place(x=100,y=430)
+
+tk.Button(window, text='Open', width=8, height=1, command=print_csvPath).place(x=160,y=50)
+tk.Button(window,text='Predict with N-grams',font=('calibri',12),width=30,height=1,command=predict_ngrams).place(x=45,y=550)
+
+tk.Label(window,text='CSVs directory : ').place(x=10,y=10)
+tk.Label(window,text='Methods : ').place(x=10,y=100)
+tk.Label(window,text='Vectorizers : ').place(x=10,y=310)
+tk.Label(window,text='N-grams range : ').place(x=10,y=400)
+tk.Label(window,text='to').place(x=70,y=430)
+tk.Label(window,text='Cross validation : ').place(x=10,y=460)
+
+var_file1 = tk.StringVar()
+tk.Entry(window,textvariable=var_file1,width=25).place(x=135,y=10)
+
+t = tk.Text(window, height=41,width=96)
+t.place(x=400,y=10)
+
+window.mainloop()
\ No newline at end of file
diff --git a/ngramsGUI_UltraSuperInvincibleMega.py b/ngramsGUI_UltraSuperInvincibleMega.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a46a047dcd446b21898d95dc9456f5ba8842f81
--- /dev/null
+++ b/ngramsGUI_UltraSuperInvincibleMega.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+'''
+Author : CHEN Tinghan
+
+Version : 1.00
+
+This script provides features such as reading raw csv files for a simple
+pretreatment, machine learning driven by parameters such as the n-gram
+range, and a graphical interface.
+The input is a set of csv files located in the same directory, all in a
+uniform format with four columns: index, position, thematique and
+declaration.
+The output is displayed in the graphical interface; the computation takes
+30 seconds to 1 minute.
+'''
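+
+# Hypothetical sketch of the expected input (tab-separated, header row first):
+#   <index>  Position                Thematique  Declaration
+#   0        a agi pour les animaux  <theme>     <declaration text>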
+import tkinter as tk
+from tkinter import filedialog
+import glob
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.dummy import DummyClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn import model_selection
+from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import make_pipeline
+from sklearn.feature_extraction.text import TfidfVectorizer
+from nltk.corpus import stopwords
+import numpy as np
+import pandas as pd
+import string
+
+# Suppress warning messages
+import warnings
+warnings.filterwarnings('ignore')
+# Avoid truncating the descriptions when displayed
+pd.set_option('display.max_colwidth', None)
+
+def print_csvPath():
+    # Let the user pick the directory that contains the raw csv files
+    value = filedialog.askdirectory()
+    var_file1.set(value)
+
+def pretreatment(file):
+    path = file
+    all_files = glob.glob(path + "/*.csv")
+    li = []
+    
+    for filename in all_files:
+        # on_bad_lines='skip' supersedes the deprecated error_bad_lines=False
+        df = pd.read_csv(filename, sep="\t", index_col=None, header=0, on_bad_lines='skip')
+        li.append(df)
+    
+    # Merge the csv files into a single dataframe
+    df_ngrams = pd.concat(li, axis=0, ignore_index=True)
+    df_ngrams = df_ngrams.drop_duplicates(subset=['Declaration'])
+    df_ngrams = df_ngrams.drop(df_ngrams[df_ngrams['Position'] == '            Pas de prise de position référencée pour les animaux          '].index)
+    df_ngrams.index = range(len(df_ngrams))
+    # Normalise the Position labels; match against a copy so earlier
+    # assignments cannot affect later matching
+    position = df_ngrams['Position'].copy()
+    df_ngrams.loc[position.str.contains('agi pour'), 'Position'] = 'agi pour'
+    df_ngrams.loc[position.str.contains('agi contre'), 'Position'] = 'agi contre'
+    df_ngrams.loc[position.str.contains('penché pour'), 'Position'] = 'penché pour'
+    df_ngrams.loc[position.str.contains('penché contre'), 'Position'] = 'penché contre'
+    return df_ngrams
+
+def divide_dataset_pos_ngrams(df):
+    X = df['Declaration']
+    y_pos = df['Position']
+    return train_test_split(X, y_pos, test_size=0.2)
+
+def divide_dataset_thm_ngrams(df):
+    X = df['Declaration']
+    y_thm = df['Thematique']
+    return train_test_split(X, y_thm, test_size=0.2)
+    
+def predict_ngrams():
+    t.delete("1.0","end")
+    
+    '''
+    initialize the word vectorizer
+    '''
+    mini = num1.get()
+    maxi = num2.get()
+    
+    nltk_stopwords = stopwords.words('french') + list(string.punctuation)
+    
+    count_vectorizer = CountVectorizer(lowercase=True,
+                                       ngram_range=(mini,maxi), 
+                                       stop_words=nltk_stopwords,
+                                       max_features=5000)
+    
+    
+    tfidf_vectorizer = TfidfVectorizer(lowercase=True,
+                                       ngram_range=(mini,maxi),
+                                       stop_words=nltk_stopwords,
+                                       min_df=0.01,
+                                       max_features=5000)
+    
+    '''
+    choose a method
+    '''
+    vectorizer = vectorizers.get()
+    if vectorizer == 'CountVectorizer':
+        word_vectorizer = count_vectorizer
+    elif vectorizer == 'TfidfVectorizer':
+        word_vectorizer = tfidf_vectorizer
+    
+    method = methods.get()
+    if method == 'LogisticRegression':
+        classifier_pipeline = make_pipeline(word_vectorizer,LogisticRegression())
+    elif method == 'RandomForestClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,RandomForestClassifier())
+    elif method == 'MultinomialNB':
+        classifier_pipeline = make_pipeline(word_vectorizer,MultinomialNB())
+    elif method == 'DummyClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,DummyClassifier())
+    elif method == 'KNeighborsClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,KNeighborsClassifier())
+    elif method == 'DecisionTreeClassifier':
+        classifier_pipeline = make_pipeline(word_vectorizer,DecisionTreeClassifier())
+    
+    '''
+    machine learning code
+    '''
+    file = var_file1.get()
+    df_ngrams = pretreatment(file)
+    cv = cross_validation.get()
+    
+    if cv == 'No':
+        Xpn_train, Xpn_test, ypn_train, ypn_test = divide_dataset_pos_ngrams(df_ngrams)
+        Xtn_train, Xtn_test, ytn_train, ytn_test = divide_dataset_thm_ngrams(df_ngrams)
+        
+        classifier_pipeline.fit(Xpn_train, ypn_train)
+        pos_predictions = classifier_pipeline.predict(Xpn_test)
+        classifier_pipeline.fit(Xtn_train, ytn_train)
+        thm_predictions = classifier_pipeline.predict(Xtn_test)
+        
+        t.insert('end',f"position accuracy : {accuracy_score(ypn_test, pos_predictions)}\n")
+        labels_pos = np.unique(ypn_test)
+        cm_pos = confusion_matrix(ypn_test, pos_predictions, labels=labels_pos)
+        confusion_df_pos = pd.DataFrame(cm_pos, index=labels_pos, columns=labels_pos)
+        t.insert('end',f"{method}, N-grams:{mini}-{maxi}\n\n{confusion_df_pos}\n")
+        t.insert('end',f"(row=expected, col=predicted)\n\n")
+        t.insert('end',f"{classification_report(ypn_test, pos_predictions, target_names=labels_pos)}\n")
+        t.insert('end',"-------------------------------------------------\n")
+        t.insert('end',f"theme accuracy : {accuracy_score(ytn_test,thm_predictions)}\n")
+        labels_thm = np.unique(ytn_test)
+        t.insert('end',f"{method}, N-grams:{mini}-{maxi}\n\n{classification_report(ytn_test, thm_predictions, target_names=labels_thm)}\n")
+    else:
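+        # 2-fold cross-validation: each declaration is predicted exactly once
+        # by a model trained on the other half of the data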
+        kfold = model_selection.KFold(n_splits=2, shuffle=True)
+        
+        X = df_ngrams['Declaration']
+        y_pos = df_ngrams['Position']
+        y_thm = df_ngrams['Thematique']
+        
+        pos_predictions = model_selection.cross_val_predict(classifier_pipeline, X, y_pos,
+                                                   cv=kfold)
+        thm_predictions = model_selection.cross_val_predict(classifier_pipeline, X, y_thm,
+                                                   cv=kfold)
+        
+        t.insert('end',f"position accuracy : {accuracy_score(y_pos, pos_predictions)}\n")
+        labels_pos = np.unique(y_pos)
+        cm_pos = confusion_matrix(y_pos, pos_predictions, labels=labels_pos)
+        confusion_df_pos = pd.DataFrame(cm_pos, index=labels_pos, columns=labels_pos)
+        t.insert('end',f"{method}, N-grams:{mini}-{maxi}\n\n{confusion_df_pos}\n")
+        t.insert('end',f"(row=expected, col=predicted)\n\n")
+        t.insert('end',f"{classification_report(y_pos, pos_predictions, target_names=labels_pos)}\n")
+        t.insert('end',"-------------------------------------------------\n")
+        t.insert('end',f"theme accuracy : {accuracy_score(y_thm,thm_predictions)}\n")
+        labels_thm = np.unique(y_thm)
+        t.insert('end',f"{method}, N-grams:{mini}-{maxi}\n\n{classification_report(y_thm, thm_predictions, target_names=labels_thm)}\n")
+
+'''
+GUI code
+'''
+window = tk.Tk()
+window.title('N-grams')
+window.geometry('1280x720')
+
+methods = tk.StringVar()
+tk.Radiobutton(window, text='LogisticRegression', variable=methods, value='LogisticRegression').place(x=50,y=130)
+tk.Radiobutton(window, text='RandomForestClassifier', variable=methods, value='RandomForestClassifier').place(x=50,y=160)
+tk.Radiobutton(window, text='MultinomialNB', variable=methods, value='MultinomialNB').place(x=50,y=190)
+tk.Radiobutton(window, text='KNeighborsClassifier', variable=methods, value='KNeighborsClassifier').place(x=50,y=220)
+tk.Radiobutton(window, text='DecisionTreeClassifier', variable=methods, value='DecisionTreeClassifier').place(x=50,y=250)
+tk.Radiobutton(window, text='DummyClassifier', variable=methods, value='DummyClassifier').place(x=50,y=280)
+methods.set('LogisticRegression')
+
+vectorizers = tk.StringVar()
+tk.Radiobutton(window, text='CountVectorizer', variable=vectorizers, value='CountVectorizer').place(x=50,y=340)
+tk.Radiobutton(window, text='TfidfVectorizer', variable=vectorizers, value='TfidfVectorizer').place(x=50,y=370)
+vectorizers.set('CountVectorizer')
+
+cross_validation = tk.StringVar()
+tk.Radiobutton(window, text='Yes', variable=cross_validation, value='Yes').place(x=50,y=490)
+tk.Radiobutton(window, text='No', variable=cross_validation, value='No').place(x=150,y=490)
+cross_validation.set('Yes')
+
+num1 = tk.IntVar(value=1)
+tk.Entry(window,textvariable=num1,width=1).place(x=50,y=430)
+num2 = tk.IntVar(value=3)
+tk.Entry(window,textvariable=num2,width=1).place(x=100,y=430)
+
+tk.Button(window, text='Open', width=8, height=1, command=print_csvPath).place(x=160,y=50)
+tk.Button(window,text='Predict with N-grams',font=('calibri',12),width=30,height=1,command=predict_ngrams).place(x=45,y=550)
+
+tk.Label(window,text='CSVs directory : ').place(x=10,y=10)
+tk.Label(window,text='Methods : ').place(x=10,y=100)
+tk.Label(window,text='Vectorizers : ').place(x=10,y=310)
+tk.Label(window,text='N-grams range : ').place(x=10,y=400)
+tk.Label(window,text='to').place(x=70,y=430)
+tk.Label(window,text='Cross validation : ').place(x=10,y=460)
+
+var_file1 = tk.StringVar()
+tk.Entry(window,textvariable=var_file1,width=25).place(x=135,y=10)
+
+t = tk.Text(window, height=41,width=96)
+t.place(x=400,y=10)
+
+window.mainloop()
\ No newline at end of file