home > kero > Machine Learning
DNN Regressor
- Part 1. Synthetic Regression
- Part 2.1. Synthetic Regression. Train and save the model.
- Part 2.2. Synthetic Regression. Load model and predict.
The following codes can be found here in folder DNN regression, python under the name synregMVar_cont.ipynb (Jupyter notebook) and synregMVar_cont.py format.
Continuing from part 1 in this link, we load the model saved under output1 and output2 for prediction. It will perform badly since it is trained with a small number of data points. We will then train it further with more data points and perform better prediction.
import kero.DataHandler.RandomDataFrame as RDF
import kero.DataHandler.DataTransform as dt
from kero.DataHandler.Generic import *
import numpy as np
import pandas as pd
import tensorflow as tf
import itertools
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from scipy.stats.stats import pearsonr
from pylab import rcParams
In this code, make sure the number of layers and hidden units are the same as the values used in the first round of training. Likewise, make sure the activation functions are the same as well. The number of steps for training in the later part of the code can be set a lot higher.
hiddenunit_set=[[32,16,8],[32,16,8]]
step_set= [2400, 2400] # [6400,6400] # [None, None]
activationfn_set=[tf.nn.relu, tf.nn.relu]
no_of_new_training_set=2000
new_test_size_frac = 0.5
rdf = RDF.RandomDataFrame()
####################################################
# Specify the input variables here
####################################################
FEATURES = ["first","second","third", "fourth","bool1", "bool2", "bool3", "bool4"]
output_label="output1" # !! List all the output column names
output_label2= "output2"
col1 = {"column_name": FEATURES[0], "items": list(range(4))}
col2 = {"column_name": FEATURES[1], "items": list(np.linspace(10, 20, 8))}
col3 = {"column_name": FEATURES[2], "items": list(np.linspace(-100, 100, 1250))}
col4 = {"column_name": FEATURES[3], "items": list(np.linspace(-1, 1, 224))}
col5 = {"column_name": FEATURES[4], "items": [0, 1]}
col6 = {"column_name": FEATURES[5], "items": [0, 1]}
col7 = {"column_name": FEATURES[6], "items": [0, 1]}
col8 = {"column_name": FEATURES[7], "items": [0, 1]}
LABEL = [output_label, output_label2]
In the following code we load the training data set from part 2.1, drop all the defective data points, split it into training part (20 data points) and test part (980 data points), similar to, but not necessarily the same as, part 2.1.
df_train = pd.read_csv(r"regressionMVartest_train.csv")
print('df train shape =',df_train.shape)
# print("train data:\n", df_train.head())
cleanD_train, crippD_train, _ = dt.data_sieve(df_train) # cleanD, crippD, origD'
cleanD_train.get_list_from_df()
colname_set_train = df_train.columns
df_train_clean = cleanD_train.clean_df
df_train_crippled = crippD_train.crippled_df
print("clean train data:\n", df_train.head())
print('df train clean shape =',df_train_clean.shape)
if df_train_crippled is not None:
print('df train crippled shape =',df_train_crippled.shape)
else:
print('df train: no defect')
# prepare
train = df_train_clean[:]
print(FEATURES," -size = ", len(FEATURES))
# Columns for tensorflow
feature_cols = [tf.contrib.layers.real_valued_column(k) for k in FEATURES]
# Training set and Prediction set with the features to predict
training_set = train[FEATURES]
prediction_set = train[LABEL]
# Train and Test
x_train, x_test, y_train, y_test = train_test_split(training_set[FEATURES] , prediction_set, test_size=0.98, random_state=42)
y_train = pd.DataFrame(y_train, columns = LABEL)
training_set = pd.DataFrame(x_train, columns = FEATURES).merge(y_train, left_index = True, right_index = True)
# Training for submission
training_sub = training_set[FEATURES]
print(training_set.head()) # NOT YET PREPROCESSED
# Same thing but for the test set
y_test = pd.DataFrame(y_test, columns = LABEL)
testing_set = pd.DataFrame(x_test, columns = FEATURES).merge(y_test, left_index = True, right_index = True)
print(testing_set.head()) # NOT YET PREPROCESSED
print("training size = ", training_set.shape)
print("test size = ", testing_set.shape)
Then we do pre-processing. Once done, we are ready to feed these pre-processed data into the model for prediction.
range_second = [10,20]
range_third = [-100,100]
range_fourth = [-1,1]
# range_input_set = [range_second, range_third, range_fourth]
range_output1 = [-200,200]
range_output2 = [-600,600]
range_output_set = {'output1': range_output1, 'output2': range_output2}
conj_command_set = {FEATURES[0]: "",
FEATURES[1]: "cont_to_scale",
FEATURES[2]: "cont_to_scale",
FEATURES[3]: "cont_to_scale",
FEATURES[4]: "",
FEATURES[5]: "",
FEATURES[6]: "",
FEATURES[7]: "",
# OUTPUT
LABEL[0]: "cont_to_scale",
LABEL[1]: "cont_to_scale",
}
scale_output1 = [0,1]
scale_output2 = [0,1]
scale_output_set = {'output1': scale_output1, 'output2': scale_output2}
cont_to_scale_settings_second = {"scale": [-1, 1], "mode": "uniform", "original_scale":range_second}
cont_to_scale_settings_third = {"scale": [0, 1], "mode": "uniform", "original_scale":range_third}
cont_to_scale_settings_fourth = {"scale": [0, 1], "mode": "uniform", "original_scale":range_fourth}
cont_to_scale_settings_output1 = {"scale": scale_output1 , "mode": "uniform", "original_scale":range_output1}
cont_to_scale_settings_output2 = {"scale": scale_output2 , "mode": "uniform", "original_scale":range_output2}
conj_command_setting_set = {FEATURES[0]: None,
FEATURES[1]: cont_to_scale_settings_second,
FEATURES[2]: cont_to_scale_settings_third,
FEATURES[3]: cont_to_scale_settings_fourth,
FEATURES[4]: None,
FEATURES[5]: None,
FEATURES[6]: None,
FEATURES[7]: None,
# OUTPUT
LABEL[0]: cont_to_scale_settings_output1,
LABEL[1]: cont_to_scale_settings_output2,
}
# Model
tf.logging.set_verbosity(tf.logging.ERROR)
regressor_set = []
for i in range(len(LABEL)):
regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
activation_fn = activationfn_set[i], hidden_units=hiddenunit_set[i],
model_dir=LABEL[i])
# optimizer = tf.train.GradientDescentOptimizer(learning_rate= 0.1))
regressor_set.append(regressor)
# Reset the index of training
training_set.reset_index(drop = True, inplace =True)
def input_fn(data_set, one_label, pred = False):
# one_label is the element of LABEL
if pred == False:
feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
labels = tf.constant(data_set[one_label].values)
return feature_cols, labels
if pred == True:
feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
return feature_cols
# Conjugate
cleanD_testing_set = dt.clean_data()
cleanD_testing_set.clean_df = testing_set
cleanD_testing_set.build_conj_dataframe(conj_command_set, conj_command_setting_set=conj_command_setting_set)
test_conj = cleanD_testing_set.clean_df_conj[:]
print("scaled test=\n", test_conj.head(10))
Loading Model for Prediction
Then we perform the prediction on 980 data points that we just created.
# Evaluation on the test set created by train_test_split
print("Final Loss on the testing set: ")
predictions_prev_set = []
for i in range(len(LABEL)):
ev = regressor_set[i].evaluate(input_fn=lambda: input_fn(test_conj,LABEL[i]), steps=1)
loss_score1 = ev["loss"]
print(LABEL[i],"{0:f}".format(loss_score1))
# Predictions
y = regressor_set[i].predict(input_fn=lambda: input_fn(test_conj,LABEL[i]))
predictions_prev = list(itertools.islice(y, test_conj.shape[0]))
predictions_prev_set.append(predictions_prev)
print("predictions_prev_set length = ",len(predictions_prev_set))
corrcoeff_set = []
predictions_set = []
reality_set = []
print("pearson correlation coefficients = ")
for i in range(len(LABEL)):
# print(LABEL[i]," : ",init_scale_max ,init_scale_min)
# need to inverse transform #
# since prediction is in conj form
initial_scale = [range_output_set[LABEL[i]][0],range_output_set[LABEL[i]][1]]
orig_scale = [scale_output_set[LABEL[i]][0],scale_output_set[LABEL[i]][1]]
pred_inv = dt.conj_from_cont_to_scaled(predictions_prev_set[i], scale=initial_scale, mode="uniform",original_scale=orig_scale)
#############################
predictions = pd.DataFrame(pred_inv,columns = ['Prediction'])
predictions_set = predictions_set + [pred_inv] # a list, or column
# predictions.head()
reality = testing_set[LABEL[i]].values # a list, or column
reality_set = reality_set + [reality]
# reality.head()
corrcoeff=pearsonr(list(predictions.Prediction), list(reality))
corrcoeff_set.append(corrcoeff)
print(LABEL[i], " : ", corrcoeff)
matplotlib.rc('xtick', labelsize=20)
matplotlib.rc('ytick', labelsize=20)
for i in range(len(LABEL)):
fig, ax = plt.subplots()
# plt.style.use('ggplot')
plt.scatter(predictions_set[i], reality_set[i],s=3, c='r', lw=0) # ,'ro'
plt.xlabel('Predictions', fontsize = 20)
plt.ylabel('Reality', fontsize = 20)
plt.title('Predictions x Reality on dataset Test: '+LABEL[i], fontsize = 20)
plt.plot([reality_set[i].min(), reality_set[i].max()], [reality_set[i].min(), reality_set[i].max()], 'k--', lw=2)

As shown above, the prediction performance is poor.
Further Training
Now, we create more data for further training.
no_of_data_points = [no_of_new_training_set, None] # number of rows for training and testing data sets to be generated.
puncture_rate=0.001
rdf = RDF.RandomDataFrame()
####################################################
# Specify the input variables here
####################################################
FEATURES = ["first","second","third", "fourth","bool1", "bool2", "bool3", "bool4"]
# output_label= # !! List all the output column names
# output_label2='output2'
LABEL = ['output1','output2']
col1 = {"column_name": FEATURES[0], "items": list(range(4))}
col2 = {"column_name": FEATURES[1], "items": list(np.linspace(10, 20, 8))}
col3 = {"column_name": FEATURES[2], "items": list(np.linspace(-100, 100, 1250))}
col4 = {"column_name": FEATURES[3], "items": list(np.linspace(-1, 1, 224))}
col5 = {"column_name": FEATURES[4], "items": [0, 1]}
col6 = {"column_name": FEATURES[5], "items": [0, 1]}
col7 = {"column_name": FEATURES[6], "items": [0, 1]}
col8 = {"column_name": FEATURES[7], "items": [0, 1]}
rdf.initiate_random_table(no_of_data_points[0], col1, col2, col3, col4, col5, col6, col7, col8, panda=True)
# print("clean\n", rdf.clean_df)
df_temp = rdf.clean_df
listform, column_name_list = dt.dataframe_to_list(df_temp)
########################################################
# Specify the system of equations which determines
# the output variables.
########################################################
tempcol = []
tempcol2 = []
gg = listform[:]
column_name_list = list(column_name_list)
########## Specifiy the name(s) of the output variable(s) ##########
column_name_list = column_name_list + LABEL
listform = list(listform)
for i in range(len(listform[0])):
# example 0 (very easy)
# temp = gg[0][i] + gg[1][i] + gg[2][i] + gg[3][i] + gg[4][i] + gg[5][i] + gg[6][i] + gg[7][i]
# temp2 = gg[0][i] - gg[1][i] + gg[2][i] - gg[3][i] + gg[4][i] - gg[5][i] + gg[6][i] - gg[7][i]
# example 1
temp = gg[0][i]**2 + gg[1][i] + gg[2][i] + (gg[4][i] + gg[5][i])*gg[3][i] + gg[6][i] + gg[7][i]
temp2 = gg[0][i] - gg[1][i]**2 + gg[2][i] - gg[3][i]*(0.5*(gg[6][i] - gg[7][i])) + gg[4][i] - gg[5][i]
########################################
tempcol = tempcol + [temp]
tempcol2 = tempcol2 + [temp2]
listform = listform + [tempcol, tempcol2]
# for i in range(len(listform)):
# print(column_name_list[i], '-', listform[i])
########################################################
listform = transpose_list(listform)
# print(listform)
# print(column_name_list)
temp_df = pd.DataFrame(listform, columns=column_name_list)
rdf.clean_df = temp_df
# print(rdf.clean_df)
rdf.crepify_table(rdf.clean_df, rate=puncture_rate)
# print("post crepfify\n", rdf.crepified_df)
rdf.crepified_df.to_csv("regressionMVartest_train_more.csv", index=False)
We load the new training set, split them into training and test part (this time 50% each), and perform the pre-processing on the training part of the new training set.
df_train = pd.read_csv(r"regressionMVartest_train_more.csv")
print('df train shape =',df_train.shape)
# print("train data:\n", df_train.head())
cleanD_train, crippD_train, _ = dt.data_sieve(df_train) # cleanD, crippD, origD'
cleanD_train.get_list_from_df()
colname_set_train = df_train.columns
df_train_clean = cleanD_train.clean_df
df_train_crippled = crippD_train.crippled_df
print("clean train data:\n", df_train.head())
print('df train clean shape =',df_train_clean.shape)
if df_train_crippled is not None:
print('df train crippled shape =',df_train_crippled.shape)
else:
print('df train: no defect')
# prepare
dftr=df_train_clean[:]
dftr.head()
train = dftr
print(FEATURES," -size = ", len(FEATURES))
feature_cols = [tf.contrib.layers.real_valued_column(k) for k in FEATURES]
# Training set and Prediction set with the features to predict
training_set = train[FEATURES]
prediction_set = train[LABEL]
# Train and Test
x_train, x_test, y_train, y_test = train_test_split(training_set[FEATURES] , prediction_set, test_size=new_test_size_frac , random_state=42)
y_train = pd.DataFrame(y_train, columns = LABEL)
training_set = pd.DataFrame(x_train, columns = FEATURES).merge(y_train, left_index = True, right_index = True)
# Training for submission
training_sub = training_set[FEATURES]
print(training_set.head()) # NOT YET PREPROCESSED
# Same thing but for the test set
y_test = pd.DataFrame(y_test, columns = LABEL)
testing_set = pd.DataFrame(x_test, columns = FEATURES).merge(y_test, left_index = True, right_index = True)
print(testing_set.head()) # NOT YET PREPROCESSED
print("training size = ", training_set.shape)
print("test size = ", testing_set.shape)
range_second = [10,20]
range_third = [-100,100]
range_fourth = [-1,1]
# range_input_set = [range_second, range_third, range_fourth]
range_output1 = [-200,200]
range_output2 = [-600,600]
range_output_set = {'output1': range_output1, 'output2': range_output2}
conj_command_set = {FEATURES[0]: "",
FEATURES[1]: "cont_to_scale",
FEATURES[2]: "cont_to_scale",
FEATURES[3]: "cont_to_scale",
FEATURES[4]: "",
FEATURES[5]: "",
FEATURES[6]: "",
FEATURES[7]: "",
# OUTPUT
LABEL[0]: "cont_to_scale",
LABEL[1]: "cont_to_scale",
}
scale_output1 = [0,1]
scale_output2 = [0,1]
scale_output_set = {'output1': scale_output1, 'output2': scale_output2}
cont_to_scale_settings_second = {"scale": [-1, 1], "mode": "uniform", "original_scale":range_second}
cont_to_scale_settings_third = {"scale": [0, 1], "mode": "uniform", "original_scale":range_third}
cont_to_scale_settings_fourth = {"scale": [0, 1], "mode": "uniform", "original_scale":range_fourth}
cont_to_scale_settings_output1 = {"scale": scale_output1 , "mode": "uniform", "original_scale":range_output1}
cont_to_scale_settings_output2 = {"scale": scale_output2 , "mode": "uniform", "original_scale":range_output2}
conj_command_setting_set = {FEATURES[0]: None,
FEATURES[1]: cont_to_scale_settings_second,
FEATURES[2]: cont_to_scale_settings_third,
FEATURES[3]: cont_to_scale_settings_fourth,
FEATURES[4]: None,
FEATURES[5]: None,
FEATURES[6]: None,
FEATURES[7]: None,
# OUTPUT
LABEL[0]: cont_to_scale_settings_output1,
LABEL[1]: cont_to_scale_settings_output2,
}
cleanD_training_set = dt.clean_data()
cleanD_training_set.clean_df = training_set
cleanD_training_set.build_conj_dataframe(conj_command_set, conj_command_setting_set=conj_command_setting_set)
train_conj = cleanD_training_set.clean_df_conj[:]
print("scaled train=\n", train_conj.head(10))
We write the model here, perform training, perform pre-processing on the test part of the training set, and predict the outcome of the test part of the training set.
# Model
tf.logging.set_verbosity(tf.logging.ERROR)
regressor_set = []
for i in range(len(LABEL)):
regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
activation_fn = activationfn_set[i], hidden_units=hiddenunit_set[i],
model_dir=LABEL[i])
# optimizer = tf.train.GradientDescentOptimizer(learning_rate= 0.1))
regressor_set.append(regressor)
# Reset the index of training
training_set.reset_index(drop = True, inplace =True)
def input_fn(data_set, one_label, pred = False):
# one_label is the element of LABEL
if pred == False:
feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
labels = tf.constant(data_set[one_label].values)
return feature_cols, labels
if pred == True:
feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
return feature_cols
# TRAINING HERE
for i in range(len(LABEL)):
regressor_set[i].fit(input_fn=lambda: input_fn(train_conj, LABEL[i]), steps=step_set[i])
# Conjugate testing part of the training set
cleanD_testing_set = dt.clean_data()
cleanD_testing_set.clean_df = testing_set
cleanD_testing_set.build_conj_dataframe(conj_command_set, conj_command_setting_set=conj_command_setting_set)
test_conj = cleanD_testing_set.clean_df_conj[:]
print("scaled test=\n", test_conj.head(10))
# Evaluation on the test set created by train_test_split
print("Final Loss on the testing set: ")
predictions_prev_set_new = []
for i in range(len(LABEL)):
ev = regressor_set[i].evaluate(input_fn=lambda: input_fn(test_conj,LABEL[i]), steps=1)
loss_score1 = ev["loss"]
print(LABEL[i],"{0:f}".format(loss_score1))
# Predictions
y = regressor_set[i].predict(input_fn=lambda: input_fn(test_conj,LABEL[i]))
predictions_prev = list(itertools.islice(y, test_conj.shape[0]))
predictions_prev_set_new.append(predictions_prev)
print("predictions_prev_set_new length = ",len(predictions_prev_set))
corrcoeff_set_new = []
predictions_set_new = []
reality_set_new = []
print("pearson correlation coefficients = ")
for i in range(len(LABEL)):
# print(LABEL[i]," : ",init_scale_max ,init_scale_min)
# need to inverse transform #
# since prediction is in conj form
initial_scale = [range_output_set[LABEL[i]][0],range_output_set[LABEL[i]][1]]
orig_scale = [scale_output_set[LABEL[i]][0],scale_output_set[LABEL[i]][1]]
pred_inv = dt.conj_from_cont_to_scaled(predictions_prev_set_new[i], scale=initial_scale, mode="uniform",original_scale=orig_scale)
#############################
predictions = pd.DataFrame(pred_inv,columns = ['Prediction'])
predictions_set_new = predictions_set_new + [pred_inv] # a list, or column
# predictions.head()
reality = testing_set[LABEL[i]].values # a list, or column
reality_set_new = reality_set_new + [reality]
# reality.head()
corrcoeff=pearsonr(list(predictions.Prediction), list(reality))
corrcoeff_set.append(corrcoeff)
print(LABEL[i], " : ", corrcoeff)
matplotlib.rc('xtick', labelsize=20)
matplotlib.rc('ytick', labelsize=20)
for i in range(len(LABEL)):
fig2, ax2 = plt.subplots()
# plt.style.use('ggplot')
plt.scatter(predictions_set_new[i], reality_set_new[i],s=3, c='r', lw=0) # ,'ro'
plt.xlabel('Predictions', fontsize = 20)
plt.ylabel('Reality', fontsize = 20)
plt.title('Predictions x Reality on dataset Test: '+LABEL[i], fontsize = 20)
ax2.plot([reality_set_new[i].min(), reality_set_new[i].max()], [reality_set_new[i].min(), reality_set_new[i].max()], 'k--', lw=2)

As shown above, the outcome of the new training shows up better, i.e. closer to the true theoretical values.
Finally, we use the newly trained model to predict the outcome of our test data. First we do pre-processing.
df_test = pd.read_csv(r"regressionMVartest_test.csv")
# print('df test shape =',df_test.shape)
# print("\ntest data", df_test.head())
cleanD_test, crippD_test, _ = dt.data_sieve(df_test)
cleanD_test.get_list_from_df()
colname_set_train = df_train.columns
df_test_clean = cleanD_test.clean_df
df_test_crippled = crippD_test.crippled_df
conj_command_set_test = {FEATURES[0]: "",
FEATURES[1]: "cont_to_scale",
FEATURES[2]: "cont_to_scale",
FEATURES[3]: "cont_to_scale",
FEATURES[4]: "",
FEATURES[5]: "",
FEATURES[6]: "",
FEATURES[7]: "",
}
conj_command_setting_set_test = {FEATURES[0]: None,
FEATURES[1]: cont_to_scale_settings_second,
FEATURES[2]: cont_to_scale_settings_third,
FEATURES[3]: cont_to_scale_settings_fourth,
FEATURES[4]: None,
FEATURES[5]: None,
FEATURES[6]: None,
FEATURES[7]: None,
}
# Same thing (preprocessing) but for the test set
cleanD_test.build_conj_dataframe(conj_command_set_test, conj_command_setting_set=conj_command_setting_set_test)
test_predict_conj = cleanD_test.clean_df_conj[:]
print("scaled test=\n", test_predict_conj.head(10))
print(df_test.shape)
print(test_predict_conj.shape)
Next we print out the prediction on a separate file synregMVar_submission.csv. The result is compared with the true solutions recorded in regressionMVartest_test_correctans.csv generated in part 1.
filename = "synregMVar_submission.csv"
df_test_correct_ans = pd.read_csv(r"regressionMVartest_test_correctans.csv")
y_predict_inv_set = []
for i in range(len(LABEL)):
y_predict = regressor_set[i].predict(input_fn=lambda: input_fn(test_predict_conj , LABEL[i], pred = True))
# need to transform back
y_predict_before = list(itertools.islice(y_predict, df_test.shape[0]))
# !!
initial_scale = [range_output_set[LABEL[i]][0],range_output_set[LABEL[i]][1]]
orig_scale = [scale_output_set[LABEL[i]][0],scale_output_set[LABEL[i]][1]]
y_predict_inv = dt.conj_from_cont_to_scaled(y_predict_before, scale=initial_scale, mode="uniform",original_scale=orig_scale)
y_predict_inv_set = y_predict_inv_set + [y_predict_inv]
fig2, ax2 = plt.subplots()
real_test = np.array(list(df_test_correct_ans[LABEL[i]]))
plt.scatter(y_predict_inv, real_test,s=3, c='r', lw=0) # ,'ro'
plt.xlabel('Predictions', fontsize = 20)
plt.ylabel('Reality', fontsize = 20)
plt.title('Predictions x Reality on dataset Test: '+LABEL[i], fontsize = 20)
ax2.plot([real_test.min(), real_test.max()], [real_test.min(), real_test.max()], 'k--', lw=4)
y_predict_inv_set = transpose_list(y_predict_inv_set)
# print(y_predict_inv_set)
y_predict_for_csv = pd.DataFrame(y_predict_inv_set, columns = LABEL)
y_predict_for_csv.to_csv(filename, index=False)

The prediction is stored in synregMVar_submission.csv.