# Files
# Azalea (on HyDEV-Daisy) 428d5afc58 [U] Update
# 2022-10-01 13:34:56 -04:00

# 82 lines
# 3.2 KiB
# Python

import numpy as np
import pandas as pd
import joblib
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
def print_prec(y_test, y_pred):
    """Print true/predicted labels followed by binary classification scores.

    Both label sequences are echoed as integer arrays. Precision, recall and
    F1 are then computed with ``average='binary'``, accuracy is computed
    separately, and all four are printed as percentages with one decimal.

    Args:
        y_test: Iterable of ground-truth labels; each item must be
            convertible with ``int()``.
        y_pred: Iterable of predicted labels, same requirements as y_test.
    """
    # Echo both label vectors first so they appear above the scores.
    for caption, values in (('y Test Label', y_test), ('y Prediction', y_pred)):
        print(f'{caption}: {np.array([int(v) for v in values])}')

    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_test, y_pred, average='binary')
    accuracy = accuracy_score(y_test, y_pred)

    # Reporting order is fixed: precision, recall, accuracy, F1.
    report = (
        ('Precision', precision),
        ('Recall', recall),
        ('Accuracy', accuracy),
        ('F1', f_score),
    )
    for caption, score in report:
        print(f'{caption}: {score * 100:0.1f}')
def main(data_path='data-all.absolute-features.csv',
         model_path='./random_forest.joblib',
         random_state=0):
    """Train a random forest on the combined feature table and report on it.

    Steps: load the CSV, hold out 20% of the rows, oversample the remaining
    training rows, fit the forest, print metrics for the held-out rows via
    ``print_prec``, save the fitted model with joblib, and print the feature
    names next to the feature importances.

    Args:
        data_path: CSV containing a ``LABEL`` column, a ``Time`` column and
            the feature columns. ``LABEL`` is the target; ``Time`` is dropped
            and not used as a feature.
        model_path: Destination file for the fitted classifier.
        random_state: Seed shared by the train/test split, the oversampler
            and the forest so that repeated runs are comparable.
    """
    # Named `dataset` (not `all`) so the builtin all() is not shadowed.
    dataset = pd.read_csv(data_path)
    labels = dataset['LABEL']
    features = dataset.drop(columns=['LABEL', 'Time'])

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=random_state)

    # Only the training split is resampled; the held-out split stays as drawn.
    sampler = RandomOverSampler(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X_resampled, y_resampled)

    print_prec(y_test, clf.predict(X_test))
    # NOTE(review): the original carried the bare note "800 667 667 727" here,
    # presumably scores from one earlier run -- confirm before relying on it.

    joblib.dump(clf, model_path)

    # NOTE(review): hard-coded values, presumably importances recorded from an
    # earlier run and printed for comparison with the current fit -- confirm.
    importance = [
        0.04336795, 0.11223876, 0.0586915, 0.03840793, 0.07281997, 0.06005508,
        0.08044522, 0.05864273, 0.07693864, 0.08805545, 0.05911849, 0.09307329,
        0.0981109, 0.06003408,
    ]
    print(features.columns)
    print(importance)
    print(clf.feature_importances_)

    # Disabled experiments that used to live here as commented-out code:
    #   * fit on a 67/33 split (random_state=42) of 'bdj-features.csv' or
    #     'd4j-features.csv' alone, then score the bdj, d4j and combined
    #     tables with print_prec;
    #   * fit on 'fm-features.csv' directly, without a split;
    #   * Spearman correlation of every column with LABEL for the bdj, d4j
    #     and fm tables, sorted by LABEL.
    # Those three CSVs are no longer loaded, since the live path never used
    # them.


if __name__ == '__main__':
    main()