# Module: jess_obesity_dataset_RandomForestRegressor.py
# Topic: Carry out RandomForestRegressor modeling on ObesityDataSet_cleaned_and_data_sinthetic.csv
# Date: 13 Jul, 2024
# Imports
import os
# Clear the console before the program prints anything ("cls" is the Windows console command).
os.system("cls")
import sys
import time
import datetime
import numpy as np
import pandas as pd
import array
# Print pandas floats with four decimal places ("display.precision" is the pandas option name).
pd.set_option('display.precision', 4)
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
import traceback as tb
import warnings
# Silence every warning from here on, including ones raised by the imports below.
warnings.filterwarnings('ignore')
# From / Imports
from pandas import DataFrame
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
# Printing functions
def print_program_runtime(start_time):
    """Print the wall-clock time elapsed since ``start_time`` as HH:MM:SS.

    Args:
        start_time (float): Value of ``time.time()`` recorded when the
            program started.
    """
    elapsed_seconds = time.time() - start_time
    # gmtime() turns the duration into a struct_time; because only the
    # hour/minute/second fields are printed, runs of 24 hours or more wrap.
    formatted_runtime = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))
    print("program runtime: {}".format(formatted_runtime))
def print_cv_scores (hyperparameter_set_label, cv_scores_array):
    """Print the mean, standard deviation and sorted values of a set of CV scores.

    Args:
        hyperparameter_set_label (str): Label naming the hyperparameter set
            the scores belong to.
        cv_scores_array (numpy.ndarray): Cross-validation scores. The array
            is not modified.
    """
    # np.sort returns a sorted copy, so the caller's array keeps its order.
    sorted_scores = np.sort(cv_scores_array)
    # NOTE(review): the label wording below is reproduced verbatim from the
    # original output strings.
    print (f" Cross Validation For {hyperparameter_set_label}:")
    print (f" Imply CV Rating : {cv_scores_array.mean()}")
    print (f" Stan Dev CV Rating : {cv_scores_array.std()}")
    print (f" Sorted CV Scores Array : {sorted_scores}")
def print_regression_scores (r2_score_value, ar2_score_value, mean_squared_error_array, root_mse):
    """Print the regression metrics, rounded for display.

    R2 and adjusted R2 are shown with 4 decimal places, the two error
    metrics with 8. A blank line is printed after the metrics.

    Args:
        r2_score_value (float): R2 score.
        ar2_score_value (float): Adjusted R2 score.
        mean_squared_error_array (float): Mean squared error (a scalar,
            despite the parameter name).
        root_mse (float): Root mean squared error.
    """
    def _rounded(value, places):
        # Format then re-parse so the printed float carries no trailing zeros.
        return float(f"{value:0.{places}f}")

    # NOTE(review): the label wording below is reproduced verbatim from the
    # original output strings.
    print (" Regression R2 and MSE")
    print (" R2 : ", _rounded(r2_score_value, 4))
    print (" Adjusted R2 : ", _rounded(ar2_score_value, 4))
    print (" Imply Squared Error : ", _rounded(mean_squared_error_array, 8))
    print (" Root Imply Squared Error : ", _rounded(root_mse, 8))
    print ()
# Miscellaneous functions
def get_directory_strings ():
    """Build the folder and file names the program reads from and writes to.

    Reads the module-level ``g_date_suffix`` to name the per-run images
    folder, so that global must be set before this function is called.

    Returns:
        tuple[str, str, str, str]: ``(images_path_folder, csv_path_folder,
        csv_full_file_name, csv_preprocessed_file_name)``.
        ``csv_full_file_name`` is a bare file name; the other three are
        full paths below the base folder.
    """
    # NOTE(review): the original literal had lost its path separators
    # ("C:ProjectsPythonDATA260..."); the segmentation below is the most
    # plausible reading -- confirm against the real project location.
    base_path = "C:/Projects/Python/DATA260/data_260_python/src/CategoricalExperiment"
    csv_path_folder = f"{base_path}/csv"
    csv_file_name = "ObesityDataSet_cleaned_and_data_sinthetic"
    csv_file_extension = ".csv"
    csv_full_file_name = csv_file_name + csv_file_extension
    images_path_folder = f"{base_path}/photos/{g_date_suffix}"
    csv_preprocessed_file_name = f"{csv_path_folder}/{csv_file_name}_preprocessed{csv_file_extension}"
    return images_path_folder, csv_path_folder, csv_full_file_name, csv_preprocessed_file_name
# DataFrame functions
def get_dataframe_from_csv (csv_path_folder, csv_file_name):
    """Read a CSV file into a DataFrame.

    Args:
        csv_path_folder (str): Folder that contains the CSV file.
        csv_file_name (str): File name of the CSV file, including extension.

    Returns:
        pandas.DataFrame: Contents of the CSV file.

    Raises:
        Exception: Any error raised while reading is re-raised after its
            traceback has been printed; the program is meant to stop here.
    """
    try:
        csv_path_file = os.path.join(csv_path_folder, csv_file_name)
        return pd.read_csv(filepath_or_buffer=csv_path_file)
    except Exception:
        # print_exc() writes the traceback itself; wrapping it in print()
        # would only add the function object or a stray "None".
        tb.print_exc()
        raise  # We want the program to fail here
def get_manual_factorization_dictionary():
    """Return the value-to-integer codes used to factorize each categorical column.

    Codes start at 1 and increase by one per category within a column.

    Returns:
        list: One ``[column_name, [(string_value, code), ...]]`` entry per
        column, in the order the columns are factorized. Despite the
        function name, the result is a list, not a dict.
    """
    # Code tables shared by several columns
    YN_factorization = [("no", 1), ("yes", 2)]
    NoSoFrAl_factorization = [("no", 1), ("sometimes", 2), ("frequently", 3), ("always", 4)]
    NeSoAl_factorization = [("never", 1), ("sometimes", 2), ("always", 3)]
    # Per-column code tables
    Gender_factorization = ["Gender", [("male", 1), ("female", 2)]]
    FHWO_factorization = ["FHWO", YN_factorization]
    FAVC_factorization = ["FAVC", YN_factorization]
    FCVC_factorization = ["FCVC", NeSoAl_factorization]
    CAEC_factorization = ["CAEC", NoSoFrAl_factorization]
    SMOKE_factorization = ["SMOKE", YN_factorization]
    CH2O_factorization = ["CH2O", [("less than a liter", 1), ("between 1 and 2 l", 2), ("more than 2 l", 3)]]
    SCC_factorization = ["SCC", YN_factorization]
    FAF_factorization = ["FAF", [("0", 1), ("1 to 2", 2), ("2 to 4", 3), ("4 to 5", 4)]]
    # ">5" previously shared code 2 with "3 to 5", which merged the two
    # categories; it now gets its own code like every other category.
    TUE_factorization = ["TUE", [("0 to 2", 1), ("3 to 5", 2), (">5", 3)]]
    CALC_factorization = ["CALC", NoSoFrAl_factorization]
    MTRANS_factorization = [
        "MTRANS",
        [("automobile", 1), ("motorbike", 2), ("public_transportation", 3), ("bike", 4), ("walking", 5)],
    ]
    NObeyesdad_factorization = [
        "NObeyesdad",
        [
            ("insufficient_weight", 1), ("normal_weight", 2), ("overweight_level_i", 3),
            ("overweight_level_ii", 4), ("obesity_type_i", 5), ("obesity_type_ii", 6),
            ("obesity_type_iii", 7),
        ],
    ]
    return_dictionary = [
        Gender_factorization, FHWO_factorization, FAVC_factorization,
        FCVC_factorization, CAEC_factorization, SMOKE_factorization,
        CH2O_factorization, SCC_factorization, FAF_factorization,
        TUE_factorization, CALC_factorization, MTRANS_factorization,
        NObeyesdad_factorization,
    ]
    return return_dictionary
def manual_factorization (df, manual_factorization_dictionary, verbose=0):
    """Replace categorical string values with their integer codes.

    Args:
        df (pandas.DataFrame): Data to factorize. It is not modified.
        manual_factorization_dictionary (list): Entries shaped like
            ``[column_name, [(string_value, code), ...]]``, as produced by
            get_manual_factorization_dictionary. Every named column must
            exist in ``df``.
        verbose (int): [default 0] Values above 0 print each replacement.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the listed values replaced.
        Values that are not listed are left unchanged.
    """
    return_df = df.copy()
    if (verbose > 0):
        print ("Performaing Guide Factorization")
    for factor_column in manual_factorization_dictionary:
        factor_title = factor_column[0]
        factor_pairs = factor_column[1]
        if (verbose > 0):
            print (f"Column {factor_title}:")
            print (f" Worth Dictionary: {factor_pairs}")
        for raw_value, code in factor_pairs:
            # Assign the result back to the column. Calling
            # replace(..., inplace=True) on return_df[col] acts on a
            # temporary Series and is not guaranteed to reach the frame.
            return_df[factor_title] = return_df[factor_title].replace(raw_value, code)
            if (verbose > 0):
                print (f" Changed : {str(raw_value).rjust(21)}, {str(code).rjust(3)}")
        if (verbose > 0):
            print()
    return return_df
def drop_features (df, drop_features_list):
    """Return a copy of ``df`` without the listed columns.

    Args:
        df (pandas.DataFrame): DataFrame to copy. It is not modified.
        drop_features_list (list[str]): Names of the columns to remove.

    Returns:
        pandas.DataFrame: New DataFrame without the listed columns.

    Raises:
        KeyError: If a listed column is not present in ``df``.
    """
    # drop() returns a new frame, so one call both copies and removes.
    return df.drop(columns=list(drop_features_list))
def preprocess_and_save_obesity_dataset (df, csv_preprocessed_file_name, verbose = 0, factorize=True):
    """Clean the obesity DataFrame for modelling and save it once as CSV.

    Steps, in order: drop the ``id`` column, rename
    ``family_history_with_overweight`` to ``FHWO``, add ``BMI`` if missing,
    optionally factorize the categorical columns, drop rows containing NaN,
    round ``Age`` to whole years, drop duplicate rows, and write the result
    to ``csv_preprocessed_file_name`` unless that file already exists.

    Args:
        df (pandas.DataFrame): Raw obesity data. It is not modified.
        csv_preprocessed_file_name (str): Path of the CSV file to write.
        verbose (int): [default 0] Values above 0 print progress details.
        factorize (bool): [default True] Whether categorical columns are
            converted to integer codes.

    Returns:
        pandas.DataFrame: The processed copy of ``df``.

    Raises:
        Exception: Any error is re-raised after its traceback has been
            printed; the program is meant to stop here.
    """
    try:
        return_df = df.copy(deep=True)
        if (verbose > 0):
            print("Authentic Dataset Data")
            # info() prints its report itself and returns None.
            return_df.info()
        # Dropping 'id'
        if 'id' in return_df.columns:
            if (verbose > 0):
                print("Eradicating id column")
            return_df.drop('id', axis=1, inplace=True)
        # Renaming family_history_with_overweight to FHWO
        if 'family_history_with_overweight' in return_df.columns:
            if (verbose > 0):
                print("Renaming family_history_with_overweight to FHWO")
            return_df.rename(columns={'family_history_with_overweight': 'FHWO'}, inplace=True)
        # Adding the BMI column if needed
        if 'BMI' not in return_df.columns:
            if (verbose > 0):
                print ("Calculating BMI information and including BMI column. BMI = Weight / squared(Peak)")
            return_df['BMI'] = return_df['Weight'] / (return_df['Height'] ** 2)
        # Convert all categorical columns to numeric codes
        if factorize:
            manual_factorization_dictionary = get_manual_factorization_dictionary()
            return_df = manual_factorization(return_df, manual_factorization_dictionary)
        # Remove rows with NaN first: the integer cast of Age below raises
        # on NaN, so it must only see complete rows.
        count_nan = return_df.isna().sum().sum()
        if (verbose > 0):
            return_df.info()
            print (f"Variety of NaN : {count_nan}")
        if count_nan > 0:
            if (verbose > 0):
                print ("Dropping NaN")
            return_df.dropna(inplace=True)
            return_df.reset_index(drop=True, inplace=True)
        elif (verbose > 0):
            print ("No NaN discovered.")
        # Round Age to whole years
        if (verbose > 0):
            print("Correcting Age Values Again to Integer Values (Sythetic Knowledge Added Non-Integer Values)")
        return_df['Age'] = return_df['Age'].round(0).astype(int)
        # Remove duplicate rows; rounding Age can create new ones.
        count_dup = return_df.duplicated().sum()
        if (verbose > 0):
            print (f"Variety of Duplicates : {count_dup}")
        if count_dup > 0:
            if (verbose > 0):
                print ("Dropping Duplicates")
            return_df.drop_duplicates(inplace=True)
            return_df.reset_index(drop=True, inplace=True)
        elif (verbose > 0):
            print ("No duplicates discovered.")
        # Save the processed frame, not the raw input, and only when the
        # file does not exist yet.
        if not os.path.isfile(csv_preprocessed_file_name):
            return_df.to_csv(path_or_buf=csv_preprocessed_file_name, index=False)
        else:
            print (f"File {csv_preprocessed_file_name} discovered. Not saving csv.")
        return return_df
    except Exception:
        tb.print_exc()
        raise  # We want the program to fail here
# Factories
def hyperparameter_factory(criterion_type):
    """Return the named hyperparameter set used to build the regressor.

    Args:
        criterion_type (str): ``"poisson"`` or ``"squared_error"``, in any
            letter case.

    Returns:
        tuple: ``(hyperparameter_title, hyperparameter_desc,
        hyperparameter_values)`` -- a short title, a printable description
        and a new dict of keyword arguments for the regressor. All three
        are ``None`` when ``criterion_type`` is not recognised.
    """
    check_this = criterion_type.lower()
    if check_this == 'poisson':
        # previous hyperparameter settings
        # 'bootstrap' : False, 'criterion' : 'poisson', 'max_depth' : 90, 'max_features' : 3, 'min_samples_leaf' : 1, 'min_samples_split' : 6, 'n_estimators' : 160
        # 'bootstrap' : False, 'criterion' : 'poisson', 'max_depth' : 20, 'max_features' : 2, 'min_samples_leaf' : 1, 'min_samples_split' : 4, 'n_estimators' : 410
        # 'bootstrap' : True, 'criterion' : 'poisson', 'max_depth' : 30, 'max_features' : 2, 'min_samples_leaf' : 1, 'min_samples_split' : 4, 'n_estimators' : 260
        hyperparameter_title = "Poisson"
        hyperparameter_desc = "bootstrap=True, criterion='poisson', max_depth=30, max_features=2, min_samples_leaf=1, min_samples_split=4, n_estimators=260"
        hyperparameter_values = {
            'bootstrap': True,
            'criterion': 'poisson',
            'max_depth': 30,
            'max_features': 2,
            'min_samples_leaf': 1,
            'min_samples_split': 4,
            'n_estimators': 260,
        }
    elif check_this == 'squared_error':
        # previous hyperparameter settings
        # 'bootstrap': True, 'criterion': 'squared_error', 'max_features': r, 'n_estimators': 150
        # 'bootstrap': True, 'criterion': 'squared_error', 'max_features': 4, 'n_estimators': 50
        # 'bootstrap': True, 'criterion': 'squared_error', 'max_features': 'sqrt', 'n_estimators': 250
        hyperparameter_title = "Squared_Error"
        hyperparameter_desc = "bootstrap=True, criterion='squared_error', max_features='sqrt', n_estimators=250"
        hyperparameter_values = {
            'bootstrap': True,
            'criterion': 'squared_error',
            'max_features': 'sqrt',
            'n_estimators': 250,
        }
    else:
        hyperparameter_title = None
        hyperparameter_desc = None
        hyperparameter_values = None
    return hyperparameter_title, hyperparameter_desc, hyperparameter_values
def DataFrame_factory(df, dataframe_type):
    """Return a copy of ``df`` restricted to one named group of features.

    Only the known feature columns are filtered; any other column present
    in ``df`` is kept as it is.

    Args:
        df (pandas.DataFrame): DataFrame to take the data from. It is not
            modified.
        dataframe_type (str): Name of the feature group, in any letter
            case: ``"all"``, ``"categorical"``, ``"weight-reduction plan"``
            (alias ``"diet"``), ``"train"`` (alias ``"exercise"``),
            ``"behavior"`` (alias ``"habit"``) or ``"genetic"``.

    Returns:
        pandas.DataFrame | None: The filtered copy, or ``None`` when
        ``dataframe_type`` is not a known group.
    """
    all_features = [
        "Gender", "Age", "Height", "Weight", "FHWO", "FAVC",
        "FCVC", "NCP", "CAEC", "SMOKE", "CH2O", "SCC", "FAF",
        "TUE", "CALC", "MTRANS", "NObeyesdad", "BMI",
    ]
    diet_features = ["FAVC", "FCVC", "NCP", "CAEC", "CALC", "CH2O", "BMI"]
    exercise_features = ["FAF", "TUE", "MTRANS", "BMI"]
    habit_features = ["SCC", "SMOKE", "BMI"]
    # The first key of each aliased pair is the one existing callers pass;
    # the second is the plain name used for the same group in plot labels.
    feature_groups = {
        "all": all_features,
        "categorical": [
            "Gender", "FHWO", "FAVC", "FCVC", "NCP", "CAEC",
            "SMOKE", "CH2O", "SCC", "FAF",
            "TUE", "CALC", "MTRANS", "BMI",
        ],
        "weight-reduction plan": diet_features,
        "diet": diet_features,
        "train": exercise_features,
        "exercise": exercise_features,
        "behavior": habit_features,
        "habit": habit_features,
        "genetic": ["Gender", "FHWO", "BMI"],
    }
    related_features = feature_groups.get(dataframe_type.lower())
    if related_features is None:
        return None
    # Remove the known features that are outside the group and present.
    unwanted = [
        feature for feature in all_features
        if feature not in related_features and feature in df.columns
    ]
    return df.drop(columns=unwanted)
def run_regressor (use_this_regressor, X, y):
    """Fit the regressor on an 80/20 split and score it on the held-out part.

    Args:
        use_this_regressor: Estimator offering ``fit`` and ``predict``. It
            is fitted in place.
        X (pandas.DataFrame): Feature matrix.
        y (pandas.Series): Target values.

    Returns:
        tuple: ``(use_this_regressor, r2_score_value, ar2_score_value,
        mean_squared_error_array, root_mse, X_train, X_test, y_train,
        y_test, y_predicted)``. The four scores are computed on the test
        split; ``mean_squared_error_array`` is a scalar despite its name.

    Raises:
        Exception: Any error is re-raised after its traceback has been
            printed; the program is meant to stop here.
    """
    try:
        # Data split: 80% for training, 20% for testing
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
        # Model fitting
        use_this_regressor.fit(X_train, y_train)
        # y prediction
        y_predicted = use_this_regressor.predict(X_test)
        # R2 and adjusted R2; the adjustment uses the test-split row count
        # and the number of feature columns.
        r2_score_value = r2_score(y_test, y_predicted)
        sample_count = len(X_test)
        feature_count = len(X_test.columns)
        ar2_score_value = 1 - (1 - r2_score_value) * ((sample_count - 1) / (sample_count - feature_count - 1))
        # The `squared` argument of mean_squared_error is gone from recent
        # scikit-learn releases, so the root is taken explicitly.
        mean_squared_error_array = mean_squared_error(y_test, y_predicted)
        root_mse = np.sqrt(mean_squared_error_array)
        return (
            use_this_regressor, r2_score_value, ar2_score_value, mean_squared_error_array,
            root_mse, X_train, X_test, y_train, y_test, y_predicted,
        )
    except Exception:
        tb.print_exc()
        raise  # We want the program to fail here
def k_folds_cross_validation(model_estimator, X_feature, y_label, k_number_splits, k_random_state=None):
    """Score an estimator with shuffled k-fold cross validation.

    Args:
        model_estimator (object): Estimator to evaluate.
        X_feature (pandas.DataFrame): Feature matrix.
        y_label (pandas.Series): Target values.
        k_number_splits (int): Number of folds.
        k_random_state (int, optional): Seed for the fold shuffling.
            Defaults to None.

    Returns:
        numpy.ndarray: One score per fold, using the estimator's default
        scorer.

    Raises:
        Exception: Any error is re-raised after its traceback has been
            printed; the program is meant to stop here.
    """
    try:
        k_folds = KFold(n_splits=k_number_splits, random_state=k_random_state, shuffle=True)
        # n_jobs=-1 lets scikit-learn evaluate the folds in parallel.
        return cross_val_score(estimator=model_estimator, X=X_feature, y=y_label, cv=k_folds, n_jobs=-1)
    except Exception:
        tb.print_exc()
        raise  # We want the program to fail here
# Plot and save image functions
def plot_and_save_correlation_matrix (df, title, file_title, images_path_folder, program_name, use_cmap="RdBu"):
    """Plot the correlation matrix of ``df`` as a heatmap and save it as a PNG.

    Does nothing unless the module-level ``g_plot_images`` flag is set. The
    file name is built from ``program_name``, ``file_title`` (spaces become
    underscores) and the module-level ``g_date_suffix``.

    Args:
        df (pandas.DataFrame): Data to correlate.
        title (str): Title shown on the image.
        file_title (str): Title used inside the file name.
        images_path_folder (str): Folder the image is written to.
        program_name (str): Program name used as the file-name prefix.
        use_cmap (str): [default "RdBu"] Colour map for the heatmap.

    Raises:
        Exception: Any error is re-raised after its traceback has been
            printed; the program is meant to stop here.
    """
    if not g_plot_images:
        return
    try:
        correlation_matrix = df.corr()
        # One inch per column, with a 4x4 minimum so small matrices stay legible.
        side = max(len(df.columns), 4)
        figure = plt.figure(figsize=(side, side))
        try:
            sns.heatmap(data=correlation_matrix, annot=True, cmap=use_cmap, vmin=-1.0, vmax=1.0)
            plt.title(title)
            filename = f"{images_path_folder}/{program_name.replace(' ','_')}_{file_title.replace(' ','_')}{g_date_suffix}.png"
            plt.savefig(filename)
        finally:
            # Close the figure; clearing alone leaves it alive in pyplot and
            # figures would pile up over repeated calls.
            plt.close(figure)
    except Exception:
        tb.print_exc()
        raise  # We want the program to fail here
def plot_and_save_gini_feature_importance_plot (mannequin, data_frame, title, file_title, images_path_folder, program_name):
    """Plot the model's feature importances as a bar chart and save it as a PNG.

    Does nothing unless the module-level ``g_plot_images`` flag is set. The
    file name is built from ``program_name``, ``file_title`` (spaces become
    underscores) and the module-level ``g_date_suffix``.

    Args:
        mannequin: Fitted model exposing ``feature_importances_``.
        data_frame (pandas.DataFrame): Frame whose column names label the
            importances; its column order must match the model's features.
        title (str): Title shown on the image.
        file_title (str): Title used inside the file name.
        images_path_folder (str): Folder the image is written to.
        program_name (str): Program name used as the file-name prefix.

    Raises:
        Exception: Any error is re-raised after its traceback has been
            printed; the program is meant to stop here.
    """
    if not g_plot_images:
        return
    try:
        # Pair each column name with its importance, largest first.
        df_feature_importances = pd.DataFrame({
            'feature_name': np.array(data_frame.columns),
            'feature_importance': np.array(mannequin.feature_importances_),
        })
        df_feature_importances.sort_values(by=["feature_importance"], ascending=False, inplace=True)
        figure = plt.figure(figsize=(15, 8))
        try:
            sns.barplot(x=df_feature_importances['feature_importance'], y=df_feature_importances['feature_name'])
            plt.title(title)
            # NOTE(review): axis label wording is reproduced verbatim from
            # the original strings.
            plt.xlabel('Function Significance')
            plt.ylabel('Function Title')
            plt.tight_layout()
            filename = f"{images_path_folder}/{program_name.replace(' ','_')}_{file_title.replace(' ','_')}{g_date_suffix}.png"
            plt.savefig(filename)
        finally:
            # Close the figure so repeated calls do not accumulate figures.
            plt.close(figure)
    except Exception:
        tb.print_exc()
        raise  # We want the program to fail here
def plot_and_save_all_correlation_matrices(df, images_path_folder, program_name):
    """Print each feature-group DataFrame and save its correlation matrix plot.

    Args:
        df (pandas.DataFrame): DataFrame holding the values to correlate.
        images_path_folder (str): Folder the image files are written to.
        program_name (str): Program name used in the file names.
    """
    # (label used in titles and file names, key passed to DataFrame_factory)
    feature_groups = [
        ("all", "all"),
        ("categorical", "categorical"),
        ("diet", "weight-reduction plan"),
        ("exercise", "train"),
        ("habit", "behavior"),
        ("genetic", "genetic"),
    ]
    # Build every subset before plotting so an unknown key fails early.
    sub_dataframes = [(label, DataFrame_factory(df, key)) for label, key in feature_groups]
    for this_name, this_df in sub_dataframes:
        title = f"Correlation Matrix Plot\n{this_name.capitalize()} Options + BMI Dataset"
        file_title = f"Corr Mat Plot {this_name.capitalize()} Options"
        print(f"{this_name.upper()} RELATED DATAFRAME")
        print(this_df)
        plot_and_save_correlation_matrix (this_df, title, file_title, images_path_folder, program_name)
def plot_and_save_test_vs_pred_scatter_plot (y_test, y_pred, hyperparameter_set_label
                                              , images_path_folder, program_name
                                              ):
    """Plot true against predicted target values and save the image as a PNG.

    Does nothing unless the module-level ``g_plot_images`` flag is set. The
    plot is drawn on its own figure, so its size does not depend on earlier
    plotting calls. The file name also uses the module-level
    ``g_date_suffix``.

    Args:
        y_test (pandas.Series): True target values.
        y_pred (array-like): Predicted target values, aligned with ``y_test``.
        hyperparameter_set_label (str): Label of the hyperparameter set;
            used in the title and the file name.
        images_path_folder (str): Folder the image is written to.
        program_name (str): Program name used as the file-name prefix.

    Raises:
        Exception: Any error is re-raised after its traceback has been
            printed; the program is meant to stop here.
    """
    if not g_plot_images:
        return
    try:
        figure = plt.figure()
        try:
            # Scatter plot of the true values vs. predicted values
            plt.scatter(y_test, y_pred)
            plt.xlabel("True Values (BMI)", fontsize=15)
            plt.ylabel("Predicted Values (BMI)", fontsize=15)
            file_title = f"True vs Predicted BMI Values – Parameters {hyperparameter_set_label}"
            plt.title(file_title)
            # Line from (min true, min predicted) to (max true, max predicted)
            plt.plot(np.linspace(min(y_test), max(y_test))
                     , np.linspace(min(y_pred), max(y_pred))
                     , c="purple", linestyle='solid', label="True vs Predicted Line")
            # Least-squares straight line through the points
            unique_true = np.unique(y_test)
            fit_line = np.poly1d(np.polyfit(y_test, y_pred, 1))
            plt.plot(unique_true, fit_line(unique_true)
                     , c="blue", linestyle="solid", label="Match Line")
            # Reference line where prediction equals truth
            identity = np.linspace(min(unique_true), max(unique_true))
            plt.plot(identity, identity
                     , c="black", linestyle="dashed", label="if y_test=y_pred")
            plt.legend()
            # Spaces and line breaks become underscores in the file name.
            filename = (images_path_folder + "/" + program_name.replace(' ', '_')
                        + "_" + hyperparameter_set_label + "_"
                        + file_title.replace(' ', '_').replace('\n', '_')
                        + g_date_suffix + ".png")
            plt.savefig(filename)
        finally:
            plt.close(figure)
    except Exception:
        tb.print_exc()
        raise  # We want the program to fail here
def plot_and_save_categorical_scatter_plots (y_test, y_pred, X_test, hyperparameter_set_label, file_title
                                              , images_path_folder, program_name
                                              ):
    """Save one scatter plot per feature: true and predicted targets against that feature.

    Does nothing unless the module-level ``g_plot_images`` flag is set. Each
    feature column of ``X_test`` gets its own figure and PNG file; the file
    name also uses the module-level ``g_date_suffix``.

    Args:
        y_test (pandas.Series): True target values.
        y_pred (array-like): Predicted target values, aligned with ``y_test``.
        X_test (pandas.DataFrame): Test features; every column must be
            numeric for the fitted line.
        hyperparameter_set_label (str): Label of the hyperparameter set;
            used in the file names.
        file_title (str): Accepted for interface compatibility; the file
            title is derived per column and this value is not used.
        images_path_folder (str): Folder the images are written to.
        program_name (str): Program name used as the file-name prefix.

    Raises:
        Exception: Any error is re-raised after its traceback has been
            printed; the program is meant to stop here.
    """
    if not g_plot_images:
        return
    try:
        for col_name in X_test:
            this_X_test = X_test[col_name]
            X_ticks = np.unique(this_X_test)
            # Half a unit of padding either side of the observed values
            this_min = np.min(X_ticks) - 0.5
            this_max = np.max(X_ticks) + 0.5
            figure = plt.figure()
            try:
                ax = figure.gca()
                # Integer tick marks only: the features hold category codes.
                ax.xaxis.set_major_locator(MaxNLocator(integer=True))
                ax.set_xlim([this_min, this_max])
                # True values: larger markers, drawn first
                plt.scatter(this_X_test, y_test, color='blue', label='True Values', s=24)
                # Predicted values: smaller markers drawn on top
                plt.scatter(this_X_test, y_pred, color='purple', label='Predicted Values', s=8)
                plt.xlabel(f"{col_name} Values")
                plt.ylabel("BMI values")
                plt.title("Function Values vs Goal Variable")
                # Least-squares straight line of the predictions over this feature
                fit_line = np.poly1d(np.polyfit(this_X_test, y_pred, 1))
                plt.plot(X_ticks, fit_line(X_ticks)
                         , c="black", linestyle="dotted", label="Match Line")
                # Spaces and line breaks become underscores in the file name.
                column_file_title = f"{hyperparameter_set_label} X_test[{col_name}] vs Check & Predicted Values"
                filename = (images_path_folder + "/" + program_name.replace(' ', '_') + "_"
                            + column_file_title.replace(' ', '_').replace('\n', '_') + g_date_suffix + ".png")
                plt.legend()
                plt.savefig(filename)
            finally:
                # One figure per column: close it so figures do not pile up.
                plt.close(figure)
    except Exception:
        tb.print_exc()
        raise  # We want the program to fail here
def principal():
    """Run the full workflow: load, preprocess, plot, cross-validate and score.

    Loads the obesity CSV, preprocesses it, saves the correlation matrices,
    then for each hyperparameter set runs 13-fold cross validation and an
    80/20 regression of BMI on the categorical features, printing the
    scores and saving the diagnostic plots.
    """
    program_name = "Weight problems RandomForestRegressor"
    print(program_name)
    print()
    (images_path_folder, csv_path_folder,
     csv_full_file_name, csv_preprocessed_file_name) = get_directory_strings()
    # makedirs also creates missing parent folders and tolerates an
    # existing target folder.
    os.makedirs(images_path_folder, exist_ok=True)
    np.set_printoptions(formatter={'float': '{: 0.4f}'.format})
    # Data frame data loading
    df = get_dataframe_from_csv (csv_path_folder, csv_full_file_name)
    # Process data frame
    df = preprocess_and_save_obesity_dataset(df, csv_preprocessed_file_name)
    print ("submit preprocess df")
    print (df)
    print ()
    plot_and_save_all_correlation_matrices (df, images_path_folder, program_name)
    print()
    print()
    # Print preprocessed dataset
    print(df)
    print()
    # y target
    y = df["BMI"]
    # X features: drop the target class and the columns BMI is derived from
    drop_features_list = ["NObeyesdad", "Weight", "Height", "Age"]
    X = drop_features (df, drop_features_list)
    print(X)
    # Save correlation matrix of the categorical data with BMI
    title = "Correlation Matrix Plot + BMI\nRemoved Age, Weight, Peak, NObeyesdad"
    file_title = "Correlation Matrix Plot with BMI included"
    plot_and_save_correlation_matrix (X, title, file_title
                                      , images_path_folder, program_name)
    # Drop y column
    X = X.drop("BMI", axis=1)
    title = "Correlation Matrix Plot – BMI\nRemoved Age, Weight, Peak, NObeyesdad"
    file_title = "Correlation Matrix Plot with BMI eliminated"
    plot_and_save_correlation_matrix (X, title, file_title
                                      , images_path_folder, program_name)
    print ()
    print ("X Columns:")
    print ()
    print (X)
    # Hyperparameter sets to evaluate, as (title, description, values) tuples
    hparms_list = [
        hyperparameter_factory ("Squared_Error"),
        hyperparameter_factory ("Poisson"),
    ]
    # Cycle through the hyperparameter sets
    for item_index, (set_title, _set_desc, set_values) in enumerate(hparms_list, start=1):
        use_this_regressor = RandomForestRegressor (**set_values)
        print ()
        print (f" Testing hyperparameters set {item_index}: {set_title}")
        print (f" Hyperparameter set {item_index}: {set_values}")
        cv_scores_array = k_folds_cross_validation(use_this_regressor, X, y, 13)
        print_cv_scores (set_title, cv_scores_array)
        print ()
        # Run the regression
        (plot_this_regressor, this_r2_score_value, this_ar2_score_value,
         this_mean_squared_error_array, this_root_mse,
         X_train, X_test, y_train, y_test, y_pred) = run_regressor (use_this_regressor, X, y)
        print_regression_scores (this_r2_score_value, this_ar2_score_value
                                 , this_mean_squared_error_array, this_root_mse)
        # Plot test vs predicted scatter plot
        plot_and_save_test_vs_pred_scatter_plot (y_test=y_test, y_pred=y_pred, hyperparameter_set_label=set_title
                                                 , images_path_folder=images_path_folder, program_name=program_name
                                                 )
        # Plot categorical test vs predicted scatter plots
        plot_and_save_categorical_scatter_plots (y_test=y_test, y_pred=y_pred, X_test=X_test, hyperparameter_set_label=set_title
                                                 , file_title=f"{set_title} True vs Predicted BMI Values"
                                                 , images_path_folder=images_path_folder, program_name=program_name
                                                 )
        # Plot and save feature importance plot
        use_this_title = f"RandomForestRegressor Function Significance Plot: {set_title}"
        use_this_file_title = f"{set_title} RandomForestRegressor Function Significance Plot"
        plot_and_save_gini_feature_importance_plot (plot_this_regressor
                                                    , X, use_this_title, use_this_file_title
                                                    , images_path_folder, program_name)
        print()
        print()
if __name__ == '__main__':
    # Module-level switches read by the plotting and path helpers.
    g_plot_images = True
    start_time = time.time()
    st = datetime.datetime.now()
    # Per-run suffix such as "_2024_07_13_09_05_03"; it names the images
    # folder and is appended to every image file name.
    g_date_suffix = st.strftime("_%Y_%m_%d_%H_%M_%S")
    principal()
    print_program_runtime(start_time)