First, let’s load our dataset and perform preliminary preprocessing.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data_path = '/mnt/info/data_california_house.csv'
data = pd.read_csv(data_path)
# Encode the categorical attribute as integers
data['ocean_proximity'] = data['ocean_proximity'].replace({'ISLAND': 0, 'NEAR OCEAN': 1, '<1H OCEAN': 2, 'NEAR BAY': 3, 'INLAND': 4})
# Handle missing values by filling them with the median value of each column
data.fillna(data.median(), inplace=True)
# Select features and target
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
Isolation Forest
from sklearn.ensemble import IsolationForest

# Flag outliers (fit_predict returns 1 for inliers, -1 for outliers) and keep only inliers
iso_forest = IsolationForest(contamination=0.1)
outliers_iso = iso_forest.fit_predict(X_scaled)
X_iso = X_scaled[outliers_iso == 1]
y_iso = y[outliers_iso == 1]
Local Outlier Factor (LOF)
from sklearn.neighbors import LocalOutlierFactor

# LOF also returns 1 for inliers and -1 for outliers
lof = LocalOutlierFactor(n_neighbors=20)
outliers_lof = lof.fit_predict(X_scaled)
X_lof = X_scaled[outliers_lof == 1]
y_lof = y[outliers_lof == 1]
DBSCAN
from sklearn.cluster import DBSCAN

# DBSCAN labels noise points (outliers) as -1
dbscan = DBSCAN(eps=3, min_samples=2)
outliers_dbscan = dbscan.fit_predict(X_scaled)
X_dbscan = X_scaled[outliers_dbscan != -1]
y_dbscan = y[outliers_dbscan != -1]
Elliptic Envelope
from sklearn.covariance import EllipticEnvelope

# Fit a Gaussian envelope; fit_predict returns 1 for inliers, -1 for outliers
elliptic_env = EllipticEnvelope(contamination=0.1)
outliers_elliptic = elliptic_env.fit_predict(X_scaled)
X_elliptic = X_scaled[outliers_elliptic == 1]
y_elliptic = y[outliers_elliptic == 1]
One-Class SVM
from sklearn.svm import OneClassSVM

# nu bounds the fraction of training points treated as outliers
one_class_svm = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
outliers_svm = one_class_svm.fit_predict(X_scaled)
X_svm = X_scaled[outliers_svm == 1]
y_svm = y[outliers_svm == 1]
Splitting Data into Training and Validation Sets
# Split the data into training and validation sets
X_train_iso, X_val_iso, y_train_iso, y_val_iso = train_test_split(X_iso, y_iso, test_size=0.2, random_state=42)
X_train_lof, X_val_lof, y_train_lof, y_val_lof = train_test_split(X_lof, y_lof, test_size=0.2, random_state=42)
X_train_dbscan, X_val_dbscan, y_train_dbscan, y_val_dbscan = train_test_split(X_dbscan, y_dbscan, test_size=0.2, random_state=42)
X_train_elliptic, X_val_elliptic, y_train_elliptic, y_val_elliptic = train_test_split(X_elliptic, y_elliptic, test_size=0.2, random_state=42)
X_train_svm, X_val_svm, y_train_svm, y_val_svm = train_test_split(X_svm, y_svm, test_size=0.2, random_state=42)
Training and Evaluating the Model
# Function to train and evaluate the model
def train_evaluate_model(X_train, y_train, X_val, y_val):
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    print(f'Validation MSE: {mse:.2f}, R2: {r2:.2f}')
    return model, mse, r2

# Original Data (use all data for the baseline model)
X_train_orig, X_val_orig, y_train_orig, y_val_orig = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print("Genuine Data:")
model_original, mse_original, r2_original = train_evaluate_model(X_train_orig, y_train_orig, X_val_orig, y_val_orig)
# Isolation Forest
print("Isolation Forest:")
model_iso, mse_iso, r2_iso = train_evaluate_model(X_train_iso, y_train_iso, X_val_iso, y_val_iso)
# Local Outlier Factor
print("Local Outlier Factor:")
model_lof, mse_lof, r2_lof = train_evaluate_model(X_train_lof, y_train_lof, X_val_lof, y_val_lof)
# DBSCAN
print("DBSCAN:")
model_dbscan, mse_dbscan, r2_dbscan = train_evaluate_model(X_train_dbscan, y_train_dbscan, X_val_dbscan, y_val_dbscan)
# Elliptic Envelope
print("Elliptic Envelope:")
model_elliptic, mse_elliptic, r2_elliptic = train_evaluate_model(X_train_elliptic, y_train_elliptic, X_val_elliptic, y_val_elliptic)
# One-Class SVM
print("One-Class SVM:")
model_svm, mse_svm, r2_svm = train_evaluate_model(X_train_svm, y_train_svm, X_val_svm, y_val_svm)
Isolation Forest
- Validation MSE: 2571283975.17
- R²: 0.7984
- Outliers Detected: 1445
Isolation Forest works by isolating observations in the feature space. It effectively identifies both global and local outliers. The slight decrease in R² and the increase in MSE compared with the original data indicate that removing outliers may not have been as beneficial as expected for this dataset.
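If you want to see how confident the model is about each flagged point, the scores behind these labels can be inspected directly. A minimal sketch, assuming the iso_forest and outliers_iso objects fitted above:

# Anomaly scores from the fitted Isolation Forest: more negative means more anomalous
scores_iso = iso_forest.decision_function(X_scaled)
print(f'Outliers detected: {(outliers_iso == -1).sum()}')
print(f'Score range: {scores_iso.min():.3f} to {scores_iso.max():.3f}')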
Local Outlier Factor (LOF)
- Validation MSE: 2318421326.61
- R²: 0.8295
- Outliers Detected: 221
LOF identifies local outliers based on the density of the data points. It effectively detects local deviations in density, which can be important for certain datasets. The considerable improvement in both MSE and R² shows that LOF was highly effective at identifying and removing the outliers that negatively impacted model performance.
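The density ratios LOF computes are stored on the fitted estimator and can be checked directly. A minimal sketch, assuming the lof object fitted above:

# Negative LOF scores for the training data: values far below -1 indicate outliers
lof_scores = lof.negative_outlier_factor_
print(f'Outliers detected: {(outliers_lof == -1).sum()}')
print(f'Most anomalous score: {lof_scores.min():.2f}')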
DBSCAN
- Validation MSE: 2369209176.69
- R²: 0.8135
- Outliers Detected: 12
DBSCAN is a clustering algorithm that also identifies outliers, as points that do not belong to any cluster. The results are similar to those on the original data, suggesting that while DBSCAN removed some outliers, it did not significantly affect model performance; with only 12 points removed, this is unsurprising.
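Because DBSCAN's outlier count is very sensitive to eps and min_samples, it is worth checking how the points were actually clustered. A minimal sketch, assuming the dbscan object fitted above:

import numpy as np

# Label -1 marks noise points (outliers); other labels are cluster IDs
labels, counts = np.unique(dbscan.labels_, return_counts=True)
print(dict(zip(labels, counts)))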
Elliptic Envelope
- Validation MSE: 2629662492.59
- R²: 0.7974
- Outliers Detected: 1445
Elliptic Envelope fits a Gaussian distribution to the data and identifies outliers as points lying outside the fitted ellipse. The slight decrease in performance compared with the original data indicates that the Gaussian assumption may not hold for this dataset, leading to less effective outlier detection and removal.
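Under the hood, the envelope thresholds Mahalanobis distances, which the fitted estimator exposes. A minimal sketch, assuming the elliptic_env object fitted above:

# Squared Mahalanobis distances under the fitted Gaussian; large values lie far outside the ellipse
distances = elliptic_env.mahalanobis(X_scaled)
print(f'Outliers detected: {(outliers_elliptic == -1).sum()}')
print(f'Max squared Mahalanobis distance: {distances.max():.1f}')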
One-Class SVM
- Validation MSE: 2512428988.46
- R²: 0.7924
- Outliers Detected: 1444
One-Class SVM uses a kernel function to learn a decision function that distinguishes normal data points from outliers. The decrease in R² and increase in MSE compared with the original data suggest that this method was less effective for this particular dataset.
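Since matplotlib and seaborn were imported at the start, the five strategies can also be compared at a glance. A minimal sketch, assuming the mse_* and r2_* variables collected during the training step above:

# Collect the validation scores and plot MSE side by side
results = pd.DataFrame({
    'Method': ['Original', 'Isolation Forest', 'LOF', 'DBSCAN', 'Elliptic Envelope', 'One-Class SVM'],
    'MSE': [mse_original, mse_iso, mse_lof, mse_dbscan, mse_elliptic, mse_svm],
    'R2': [r2_original, r2_iso, r2_lof, r2_dbscan, r2_elliptic, r2_svm],
})
plt.figure(figsize=(10, 5))
sns.barplot(data=results, x='Method', y='MSE')
plt.xticks(rotation=30)
plt.title('Validation MSE by Outlier Detection Method')
plt.tight_layout()
plt.show()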