fixminer_source/experiments/random_forest_model.py

import numpy as np
import pandas as pd
import joblib
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
def print_prec(y_test, y_pred):
    print(f'y Test Label: {np.array([int(x) for x in y_test])}')
    print(f'y Prediction: {np.array([int(x) for x in y_pred])}')

    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    acc = accuracy_score(y_test,y_pred)
    print(f'Precision: {prec * 100:0.1f}')
    print(f'Recall:    {rec * 100:0.1f}')
    print(f'Accuracy:  {acc*100:0.1f}')
    print(f'F1:        {f1 * 100:0.1f}')


if __name__ == '__main__':
    fm = pd.read_csv('fm-features.csv')
    d4j = pd.read_csv('d4j-features.csv')
    bdj = pd.read_csv('bdj-features.csv')
    all = pd.read_csv('data-all.absolute-features.csv')
    #X = fm.drop(columns=['LABEL','Time'],axis = 1)  # features
    #Y = fm['LABEL']  # labels
    clf = RandomForestClassifier()
    rs= RandomOverSampler(random_state=0)
    Y_all_test=all['LABEL']
    X_all_test=all.drop(columns=['LABEL','Time'],axis = 1)
    X_train, X_test, y_train, y_test = train_test_split(X_all_test, Y_all_test, test_size=0.2)
    #print(X_train)
    X_resampled, y_resampled = rs.fit_resample(X_train, y_train)
    clf.fit(X_resampled,y_resampled)

    #clf.fit(X, Y)
    # Y_bdj_test=bdj['LABEL']
    # X_bdj_test=bdj.drop(columns=['LABEL','Time'],axis = 1)
    # Y_d4j_test=d4j['LABEL']
    # X_d4j_test=d4j.drop(columns=['LABEL','Time'],axis = 1)
    # # bdj_pred=clf.predict(X_bdj_test)
    # # d4j_pred=clf.predict(X_d4j_test)
    all_pred=clf.predict(X_test)
    # # print_prec(Y_bdj_test,bdj_pred)
    # # print_prec(Y_d4j_test,d4j_pred)
    print_prec(y_test,all_pred)
    # 800 667 667 727
    joblib.dump(clf, "./random_forest.joblib")
    importance=[0.04336795, 0.11223876, 0.0586915,  0.03840793, 0.07281997, 0.06005508,
                0.08044522, 0.05864273, 0.07693864, 0.08805545, 0.05911849, 0.09307329,
                0.0981109,  0.06003408,]
    features = X_all_test.columns
    print(features)
    print(importance)
    print(clf.feature_importances_)
    # X_train, X_test, y_train, y_test = train_test_split(X_bdj_test, Y_bdj_test, test_size=0.33, random_state=42)
    # clf.fit(X_train,y_train)
    #
    # bdj_pred=clf.predict(X_test)
    # d4j_pred=clf.predict(X_d4j_test)
    # all_pred=clf.predict(X_all_test)
    # print_prec(y_test,bdj_pred)
    # print_prec(Y_d4j_test,d4j_pred)
    # print_prec(Y_all_test,all_pred)
    #
    #
    # X_train, X_test, y_train, y_test = train_test_split(X_d4j_test, Y_d4j_test, test_size=0.33, random_state=42)
    # clf.fit(X_train,y_train)
    #
    # bdj_pred=clf.predict(X_bdj_test)
    # d4j_pred=clf.predict(X_test)
    # all_pred=clf.predict(X_all_test)
    # print_prec(Y_bdj_test,bdj_pred)
    # print_prec(y_test,d4j_pred)
    # print_prec(Y_all_test,all_pred)

    # print(bdj.corr(method='spearman').sort_values(by=['LABEL'])['LABEL'])
    # print(d4j.corr(method='spearman').sort_values(by=['LABEL'])['LABEL'])
    # print(fm.corr(method='spearman').sort_values(by=['LABEL'])['LABEL'])