"""Random-forest regression on seedling growth data.

Trains a RandomForestRegressor on an Excel training set (exported from R),
evaluates it on a held-out Excel test set, saves the test-set predictions to
a text file, exports one of the forest's trees as a PDF via graphviz, and
plots the feature importances.

Expected workbook layout (both files): a header row, four feature columns
(Temperature, Humidity, Illumination, RLAI), and the target in column 5.
"""

import openpyxl
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split  # kept: may be used by other modules
from sklearn import linear_model, tree  # linear_model kept for compatibility
import matplotlib.pyplot as plt
import pydotplus

# Module-level accumulators; train_data()/test_data() append into these, so
# calling either function twice would duplicate rows (original behavior kept).
inputs_train = []
output_train = []
inputs_test = []
output_test = []


def _read_sheet(path):
    """Read the first worksheet of *path*.

    Returns a tuple ``(features, targets)`` where ``features`` is a list of
    rows (all columns except the last) and ``targets`` is the list of values
    from column 5. The header row is skipped.
    """
    wb = openpyxl.load_workbook(path)
    ws = wb.worksheets[0]
    nrows = ws.max_row
    ncols = ws.max_column
    # Column 5 holds the target variable; rows 2..nrows skip the header.
    targets = [ws.cell(row=r, column=5).value for r in range(2, nrows + 1)]
    # All but the last column are the model inputs.
    features = [
        list(row)
        for row in ws.iter_rows(min_row=2, min_col=1, max_col=ncols - 1,
                                max_row=nrows, values_only=True)
    ]
    return features, targets


def train_data():
    """Load the training workbook (exported from R).

    Appends into the module-level ``inputs_train``/``output_train`` lists and
    returns ``(input_x, output_y)`` where ``input_x`` is a list of feature
    rows and ``output_y`` is a flat 1-D numpy array of targets.
    """
    global inputs_train, output_train
    features, targets = _read_sheet('seedling/seedling_train_data.xlsx')
    output_train.append(targets)       # nested list, flattened via ravel below
    inputs_train.extend(features)
    output_y = np.array(output_train).ravel()
    return inputs_train, output_y


def test_data():
    """Load the test workbook (exported from R).

    Mirrors :func:`train_data` but fills ``inputs_test``/``output_test``.
    Returns ``(input_x, output_y)`` with the same shapes as ``train_data``.
    """
    global inputs_test, output_test
    features, targets = _read_sheet('seedling/seedling_test_data.xlsx')
    output_test.append(targets)
    inputs_test.extend(features)
    output_y = np.array(output_test).ravel()
    return inputs_test, output_y


def establish_model():
    """Fit the forest, evaluate it, and emit all artifacts.

    Side effects: prints the model and its test R^2, writes
    ``seedling/seedling_output_predicts.txt`` (one prediction per line),
    writes ``seedling/seedling_rule_5.pdf`` (one rendered tree), prints the
    sorted feature importances, and shows a bar chart of them.
    """
    input_train_data, output_train_data = train_data()
    rf = RandomForestRegressor(
        n_estimators=500,
        # 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2;
        # 'squared_error' is the same criterion under its current name.
        criterion='squared_error',
        max_features=4,
        oob_score=True,
        n_jobs=-1,
        max_depth=None,
    )
    rf.fit(input_train_data, output_train_data)  # model training
    print(rf)

    input_test_data, output_test_data = test_data()
    predicts = rf.predict(input_test_data)  # predictions on held-out data
    print('test R^2:', rf.score(input_test_data, output_test_data))

    # Persist the predictions, one value per line.
    with open('seedling/seedling_output_predicts.txt', 'w') as file:
        for value in predicts.tolist():
            file.write(str(value))
            file.write('\n')

    # estimators_[5] is the SIXTH tree (zero-based index) of the 500 fitted.
    tree_of_rf = rf.estimators_[5]
    names_list = ['Temperature', 'Humidity', 'Illumination', 'RLAI']
    dot_data = tree.export_graphviz(
        tree_of_rf, out_file=None, feature_names=names_list,
        filled=True, rounded=True, impurity=True, node_ids=False)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("seedling/seedling_rule_5.pdf")

    # Feature importances, sorted descending and rounded for display.
    importances = list(rf.feature_importances_)
    feature_importances = sorted(
        [(feature, round(importance, 2))
         for feature, importance in zip(names_list, importances)],
        key=lambda pair: pair[1], reverse=True)
    print(feature_importances)
    for pair in feature_importances:
        print('Variable: {:20} Importance: {}'.format(*pair))

    # Bar chart of the (unsorted) importances, labeled by feature name.
    x_values = list(range(len(importances)))
    plt.bar(x_values, importances, orientation='vertical')
    plt.xticks(x_values, names_list, rotation='vertical')
    plt.ylabel('Importance')
    plt.xlabel('Variable')
    plt.title('Variable Importances')
    plt.show()


if __name__ == '__main__':
    establish_model()