82 lines
3.2 KiB
Python
82 lines
3.2 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
import joblib
|
|
from sklearn.datasets import make_classification
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from sklearn.metrics import precision_recall_fscore_support
|
|
from sklearn.metrics import accuracy_score
|
|
from sklearn.model_selection import train_test_split
|
|
from imblearn.over_sampling import RandomOverSampler
|
|
def print_prec(y_test, y_pred):
    """Print the labels, predictions and binary classification scores.

    Both label sequences are echoed as integer arrays, followed by
    precision, recall, accuracy and F1, each shown as a percentage with
    one decimal place. Precision, recall and F1 are computed with
    ``average='binary'``.

    Args:
        y_test: Iterable of ground-truth labels; each item must be
            convertible with ``int()``.
        y_pred: Iterable of predicted labels, aligned with ``y_test``.

    Returns:
        None. All results are written to standard output.
    """
    true_labels = np.array(list(map(int, y_test)))
    print(f'y Test Label: {true_labels}')
    predicted_labels = np.array(list(map(int, y_pred)))
    print(f'y Prediction: {predicted_labels}')

    # The fourth element returned by scikit-learn (support) is not reported.
    scores = precision_recall_fscore_support(
        y_test, y_pred, average='binary'
    )
    precision, recall, f_score = scores[0], scores[1], scores[2]
    accuracy = accuracy_score(y_test, y_pred)

    print(f'Precision: {precision * 100:0.1f}')
    print(f'Recall: {recall * 100:0.1f}')
    print(f'Accuracy: {accuracy * 100:0.1f}')
    print(f'F1: {f_score * 100:0.1f}')
|
|
|
|
|
|
if __name__ == '__main__':
    # Train a random forest on the combined feature table, evaluate it on a
    # held-out 20% split, persist the fitted model and print its feature
    # importances.
    #
    # NOTE: neither train_test_split nor RandomForestClassifier receives a
    # random_state, so the printed metrics and importances vary between runs.
    # Only the oversampler is seeded.

    # Named all_data rather than `all` so the builtin all() is not shadowed.
    all_data = pd.read_csv('data-all.absolute-features.csv')

    y_all = all_data['LABEL']
    # Everything except the label and the 'Time' column is used as a feature.
    X_all = all_data.drop(columns=['LABEL', 'Time'])

    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.2
    )
    # print(X_train)

    # Oversample the training split only; the test split is left untouched.
    rs = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = rs.fit_resample(X_train, y_train)

    clf = RandomForestClassifier()
    clf.fit(X_resampled, y_resampled)

    all_pred = clf.predict(X_test)
    print_prec(y_test, all_pred)
    # 800 667 667 727

    joblib.dump(clf, "./random_forest.joblib")

    # Hard-coded values printed next to the freshly fitted importances.
    # NOTE(review): presumably importances recorded from an earlier run --
    # confirm before relying on them.
    importance = [
        0.04336795, 0.11223876, 0.0586915, 0.03840793, 0.07281997, 0.06005508,
        0.08044522, 0.05864273, 0.07693864, 0.08805545, 0.05911849, 0.09307329,
        0.0981109, 0.06003408,
    ]
    features = X_all.columns
    print(features)
    print(importance)
    print(clf.feature_importances_)

    # ------------------------------------------------------------------
    # Disabled experiments, kept for reference.
    #
    # The per-project feature tables are only needed by the code below, so
    # they are no longer loaded on every run:
    # fm = pd.read_csv('fm-features.csv')
    # d4j = pd.read_csv('d4j-features.csv')
    # bdj = pd.read_csv('bdj-features.csv')
    #
    # Train on the fm table only:
    # X = fm.drop(columns=['LABEL', 'Time'])  # features
    # Y = fm['LABEL']  # labels
    # clf.fit(X, Y)
    #
    # Per-project evaluation sets:
    # Y_bdj_test = bdj['LABEL']
    # X_bdj_test = bdj.drop(columns=['LABEL', 'Time'])
    # Y_d4j_test = d4j['LABEL']
    # X_d4j_test = d4j.drop(columns=['LABEL', 'Time'])
    # # bdj_pred = clf.predict(X_bdj_test)
    # # d4j_pred = clf.predict(X_d4j_test)
    # # print_prec(Y_bdj_test, bdj_pred)
    # # print_prec(Y_d4j_test, d4j_pred)
    #
    # Train on a bdj split, evaluate on bdj hold-out, d4j and all data:
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X_bdj_test, Y_bdj_test, test_size=0.33, random_state=42)
    # clf.fit(X_train, y_train)
    #
    # bdj_pred = clf.predict(X_test)
    # d4j_pred = clf.predict(X_d4j_test)
    # all_pred = clf.predict(X_all)
    # print_prec(y_test, bdj_pred)
    # print_prec(Y_d4j_test, d4j_pred)
    # print_prec(y_all, all_pred)
    #
    # Train on a d4j split, evaluate on bdj, d4j hold-out and all data:
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X_d4j_test, Y_d4j_test, test_size=0.33, random_state=42)
    # clf.fit(X_train, y_train)
    #
    # bdj_pred = clf.predict(X_bdj_test)
    # d4j_pred = clf.predict(X_test)
    # all_pred = clf.predict(X_all)
    # print_prec(Y_bdj_test, bdj_pred)
    # print_prec(y_test, d4j_pred)
    # print_prec(y_all, all_pred)
    #
    # Spearman correlation of every column with LABEL, per project:
    # print(bdj.corr(method='spearman').sort_values(by=['LABEL'])['LABEL'])
    # print(d4j.corr(method='spearman').sort_values(by=['LABEL'])['LABEL'])
    # print(fm.corr(method='spearman').sort_values(by=['LABEL'])['LABEL'])