Life Expectancy -- Normalization, Regularization, Feature Selection¶

Last class we created a linear model for the WHO life expectancy database. Today we will focus on ways to interpret and refine that model, specifically by assessing the importance of each feature and selecting a subset of features that makes our model more interpretable and more extensible.

Outline

  1. Linear regression coefficients
  2. Normalizing the inputs
  3. Ridge Regression (L2)
  4. Lasso Regression (L1)
  5. Conclusions
In [ ]:
import pandas as pd
import os
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Let's copy our code to load and process the dataset

In [ ]:
# Load the data from a local copy of the CSV
data_root = "./"
filename = "Life_Expectancy_Data.csv"
filepath = os.path.join(data_root, filename)

# Alternatively, fetch it directly from the web:
# data_url = "https://aet-cs.github.io/white/ML/lessons/Life_Expectancy_Data.csv"
df = pd.read_csv(filepath)
target = "Life expectancy"

df.describe()
In [ ]:
def get_data(filename):
    df = pd.read_csv(filename)
    return df    

For today's exercise we will return to an ordinal encoding of the country feature. From last class it was obvious that country has a tremendous impact on life expectancy, but we're interested in more general relationships: we want to see what can be said without relying on knowledge of the country. So our preprocessing function comments out the get_dummies call.

In [ ]:
def pre_process_data(df, one_hot_encode = False):
    target = "Life expectancy"    

    # Use sklearn Imputers to fill in the categorical and numerical columns
    simple_median = SimpleImputer(strategy='median')
    simple_most_freq = SimpleImputer(strategy='most_frequent')
    
    num_cols = df.select_dtypes(include=np.number).columns # numerical data
    cat_cols = df.select_dtypes(include=object).columns # categorical data

    df[num_cols] = simple_median.fit_transform(df[num_cols])
    df[cat_cols] = simple_most_freq.fit_transform(df[cat_cols])
    
    if one_hot_encode:
        # Note: despite the parameter name, this branch applies an ordinal
        # encoding; the one-hot get_dummies call below is commented out
        O_encoder = OrdinalEncoder()
        df[cat_cols] = O_encoder.fit_transform(df[cat_cols])

        # df = pd.get_dummies(df, dtype=int)
        
    return df

We add a function here to drop features. We will only use it sometimes, so it is not part of "pre_process_data".

In [ ]:
def feature_selection(df):
    df = df.drop(["under-five deaths", "Diphtheria", "thinness 1-19 years", "Polio"], axis = 1)
    return df

The random_state logic here is a bit different. We want randomized training set selection as the default, so unless the caller sets random_state=True, we return a different training set each time.

In [ ]:
def get_test_train(df, test_size = 0.2, random_state = False):
    target = "Life expectancy"    
    X = df.drop(target, axis=1)
    y = df[target]
    if random_state:
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size, random_state=42)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)
    return X_train, X_test, y_train, y_test

First Linear Model¶

Let's do a vanilla linear regression and check the results. Notice the use of model.score to get an $r^2$ score; each model type has its own score method (usually $r^2$ for regression). Also note the use of Python f-strings, which interpolate variable values into strings. There is also a format specifier in this case, :.3, indicating 3 significant digits to be printed.

In [ ]:
df = get_data(filename)
df = pre_process_data(df, one_hot_encode = True)
X_train, X_test, y_train, y_test = get_test_train(df)
lreg = LinearRegression()
model = lreg.fit(X_train, y_train)
In [ ]:
pred = lreg.predict(X_test)
print(f"Train R-squared  = {model.score(X_train, y_train):.3}")
print(f"Test  R-squared  = {model.score(X_test, y_test):.3}")

Because the train/test split is randomized, the previous two cells will return slightly different results each time.

Let's plot the coefficients. Note how the model object contains the coefficients. The trailing underscore in coef_ is a scikit-learn convention for attributes that are learned from the data during fit (as opposed to parameters you set yourself). You should not overwrite it unless you know what you're doing, but you can read it all day long without fear.

In [ ]:
plt.barh(y = df.drop(target,axis=1).columns, width=model.coef_);
plt.title("Linear Regression Coefficients");

Look closely at this graph. We discussed how a coefficient here is a measure of importance: if, for example, HIV/AIDS increases by one unit, then life expectancy should decrease by about 0.5 units. Of course the units vary from feature to feature, so comparing these coefficients directly is essentially meaningless. We'll fix this in the next section.

But first, let's get a sense of the stability of our model. Each train/test split gives us a different $r^2$ value. If we repeat the selection-and-fit process 50 times, the spread of values gives us a sense of the model's stability and a better estimate of our real score. Write some code that (one possible sketch appears after the code cell below)

  1. Runs a linear model 50 times (start from scratch with df = get_data(filename), to be safe)
  2. Collects the scores on the test data
  3. Makes a scatter plot of the scores, with a title
  4. Prints the mean and stdev of the scores
In [ ]:
## Your code here
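
Here is one possible sketch, assuming the helper functions defined above and following the same scoring-loop pattern this notebook uses later:

In [ ]:
# Repeat the data prep and fit 50 times, scoring on the held-out test set each time
scores = []
for i in range(50):
    df = get_data(filename)
    df = pre_process_data(df, one_hot_encode = True)
    X_train, X_test, y_train, y_test = get_test_train(df)
    model = LinearRegression().fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

plt.scatter(x = range(len(scores)), y = scores);
plt.title("Test $r^2$ over 50 random train/test splits");
print(f"Mean score = {np.mean(scores):.3} Stdev = {np.std(scores):.3}")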

Feature Normalization¶

The data comes to us in vastly different scales: population can be in the billions, while percentage expenditure is close to 1. A standard best practice is to scale your input features before modeling. In this case we transform each feature $x_i$ by

$$x_i \leftarrow \dfrac{x_i - \mu_i}{\sigma_i}$$

which sets the mean to 0 and the new standard deviation to 1.
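
As a quick sanity check, here is a minimal sketch (on a small made-up column) showing that this formula matches what sklearn's StandardScaler computes:

In [ ]:
# Verify that (x - mu) / sigma matches StandardScaler on a toy column
x = np.array([[1.0], [2.0], [3.0], [4.0]])     # made-up feature values
manual = (x - x.mean()) / x.std()              # the formula above
scaled = StandardScaler().fit_transform(x)     # sklearn's scaler
print(np.allclose(manual, scaled))             # True: same result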

To simplify things, we'll just redefine get_test_train. (We could scale the features earlier in preprocessing, but sklearn returns a matrix here, not a dataframe. So we would have to reconstitute the dataframe after scaling. This way is simpler and works for now.)

In [ ]:
def get_test_train(df, test_size = 0.2, random_state = False):
    target = "Life expectancy"    
    X = df.drop(target, axis=1)

    # add a scaler here. It works by finding a fit first (computing mu and sigma)
    scaler = preprocessing.StandardScaler().fit(X)
    # and then transforming the data
    X = scaler.transform(X)
    
    y = df[target]
    if random_state:
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size, random_state=42)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)
    return X_train, X_test, y_train, y_test

Copy and modify the earlier code to make a scatter plot of the scores over 50 iterations.

In [ ]:
## Your code here

And also plot the regression coefficients, as before. Also print the "Schooling" coefficient, which is the last entry in the coef_ array.

In [ ]:
## Your code here
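
A possible sketch for the coefficient plot, assuming (as stated above) that "Schooling" is the last column after dropping the target:

In [ ]:
# Fit one model on the scaled features, plot its coefficients,
# and print the last entry of coef_ (the Schooling coefficient)
df = get_data(filename)
df = pre_process_data(df, one_hot_encode = True)
X_train, X_test, y_train, y_test = get_test_train(df)
model = LinearRegression().fit(X_train, y_train)

plt.barh(y = df.drop(target, axis=1).columns, width=model.coef_);
plt.title("Linear Regression Coefficients (scaled features)");
print(f"Schooling coefficient = {model.coef_[-1]:.3}")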

These coefficients are now in units of years per standard deviation: if schooling increases by about $1\sigma = 3.3589$ years, then life expectancy increases by about 2 years (we did not scale the target). Normalization definitely helps with interpretation. But there are still some weird things going on here. Discuss

Eliminating Collinearity¶

In the last lab we saw how collinear features can increase the condition number of the linear least squares regression matrix, which can lead to unstable solutions. It also interferes with the interpretability of the regression coefficients: if two features are already related (e.g. $x_1 = -0.3 x_2$), then either regression coefficient can change arbitrarily as long as its paired coefficient changes in tandem. By manually dropping highly correlated features we can reduce this effect somewhat (although we can't eliminate it entirely; if you look at the correlation heatmap, there is a lot of cross-correlation in this dataset).

Repeat the analysis from the last section here, but first drop the collinear columns. A function has been defined for you (look above). Feel free to modify which columns are dropped. You should create both (a sketch follows the code cell below)

  1. A scatter plot of 50 regression model scores, with dropped features
  2. A horizontal bar plot of the coefficients of one of the models
In [ ]:
## Your code here
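
This follows the same pattern as before, with feature_selection added to the preprocessing. One possible sketch:

In [ ]:
# Same loop as before, but dropping the collinear columns first
scores = []
for i in range(50):
    df = get_data(filename)
    df = pre_process_data(df, one_hot_encode = True)
    df = feature_selection(df)
    X_train, X_test, y_train, y_test = get_test_train(df)
    model = LinearRegression().fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

plt.scatter(x = range(len(scores)), y = scores);
plt.title("Test $r^2$ with collinear columns dropped");
plt.show()

# Coefficient bar plot for the last model fit in the loop
plt.barh(y = df.drop(target, axis=1).columns, width=model.coef_);
plt.title("Linear Regression Coefficients (collinear columns dropped)");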

Compare these results to the non-dropped results. Discuss

As a final analysis, it is interesting to see how the regression coefficients change as a result of the training set selection. We have been warned this model may be unstable. If this results in wildly varying regression coefficients, then we should doubt the validity of our model.

In [ ]:
coefs = []
for a in range(200):
    df = get_data(filename)
    df = pre_process_data(df, one_hot_encode = True)
    df = feature_selection(df)
    X_train, X_test, y_train, y_test = get_test_train(df)
    lreg = LinearRegression()
    model = lreg.fit(X_train, y_train)
    coefs.append(model.coef_)
In [ ]:
ax = plt.gca()
ax.plot(range(200), coefs)
plt.xlabel("Iteration")
plt.ylabel("Weights")
plt.title("Regression coefficients as a function of the training set")
plt.axis("tight")
plt.show()

These weights look pretty stable to me. So I think we're OK with this regression model.

Ridge Regression¶

While our current linear model is relatively stable to training set fluctuations, it still exhibits a wide range of weights.

In [ ]:
c = np.abs(model.coef_)
print(f"Ratio of largest to smallest coefficient: {np.max(c)/np.min(c): 0.5}")

It is possible to drive these weights to be closer together by applying a regularization technique to our optimization model. We add a term to the error function that penalizes large coefficients. Normally in linear regression the error (loss) function for $m$ observations and $n$ features is

$$err = \sum_{i=1}^m (y_i - \tilde{y}_i)^2$$

We change this to

$$err_{\alpha} = \sum_{i=1}^m (y_i - \tilde{y}_i)^2 + \alpha \sum_{i = 0}^{n} a_i^2$$

where the $a_i$ are the computed fit coefficients and $\alpha$ is a tunable parameter of the model. This is called ridge regression ($L_2$ regularization) and is available in scikit-learn. A larger $\alpha$ penalizes coefficients more heavily and drives them smaller. Because of the squared term, large coefficients are penalized relatively more than small ones and so decrease at a faster rate.

To see the effect of ridge regression, we run 200 trials with $\alpha$ varying from $1$ to $10^6$.

In [ ]:
n_alphas = 200

# create a list from 10^6 down to 10^0, log spaced
alphas = np.logspace(6,0, n_alphas)

# run a ridge regression for each alpha
coefs = []

# We perform this on the full dataset -- NO feature selection
df = get_data(filename)
df = pre_process_data(df, one_hot_encode = True)
X_train, X_test, y_train, y_test = get_test_train(df)
for a in alphas:
    ridge = Ridge(alpha=a, fit_intercept=False)
    ridge.fit(X_train, y_train)
    coefs.append(ridge.coef_)
In [ ]:
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale("log")
plt.xlabel("alpha")
plt.ylabel("weights")
plt.title("Ridge coefficients as a function of $\\alpha$ ")
plt.axis("tight")
plt.show()

We pick $\alpha=10^3$ and interpret the results.

In [ ]:
alpha = 1000

ridge = Ridge(alpha=alpha, fit_intercept=False)
ridge.fit(X_train, y_train)
plt.barh(y = df.drop(target,axis=1).columns, width=ridge.coef_);
plt.title("Linear Regression Coefficients");
In [ ]:
c = np.abs(ridge.coef_)
print(f"Ratio of largest to smallest coefficient: {np.max(c)/np.min(c): 0.5}")

The ratio hasn't decreased all that much. You should

  1. Experiment with different alpha values.
  2. Then, go back to the dataset and try dropping the collinear columns again.

What effect does this have? Discuss
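
One way to explore this is a small sweep like the following sketch. The alpha values are just illustrative, and unlike the coefficient plots above we fit an intercept here so that the test $r^2$ scores are meaningful (the target was never centered):

In [ ]:
# Compare ridge test scores for a few alphas, with and without
# dropping the collinear columns. fit_intercept=True so the scores
# are comparable (y itself is not centered).
for drop_cols in [False, True]:
    df = get_data(filename)
    df = pre_process_data(df, one_hot_encode = True)
    if drop_cols:
        df = feature_selection(df)
    X_train, X_test, y_train, y_test = get_test_train(df)
    for a in [1, 100, 1000, 10000]:
        ridge = Ridge(alpha=a, fit_intercept=True).fit(X_train, y_train)
        print(f"dropped={drop_cols}  alpha={a:>6}  test R^2 = {ridge.score(X_test, y_test):.3}")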

There is a bit of art and science in regularizing data. Ridge regression may not be doing much for us. Let's try another method.

Lasso Regression¶

Lasso regression ($L_1$ regularization) uses the error (loss) function

$$err_{\alpha} = \sum_{i=1}^m (y_i - \tilde{y}_i)^2 + \alpha \sum_{i = 0}^{n} |a_i|$$

This has the interesting mathematical effect of not just decreasing the magnitudes of the coefficients but actually driving some of them to zero. The result is that the remaining features with non-zero coefficients can be interpreted as the "most important" ones, leading to a simpler model with accuracy similar to that of the larger model.

Lasso regression is also built into scikit-learn. Let's do a similar analysis. (This next loop may throw some warnings; you can disregard them.)

In [ ]:
n_alphas = 200
alphas = np.logspace(1,-2, n_alphas)

coefs = []


# We perform this on the full dataset -- NO feature selection
df = get_data(filename)
df = pre_process_data(df, one_hot_encode = True)
X_train, X_test, y_train, y_test = get_test_train(df)

for a in alphas:
    lasso = Lasso(alpha=a, fit_intercept=True)
    lasso.fit(X_train, y_train)
    coefs.append(lasso.coef_)
In [ ]:
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale("log")
plt.xlabel("alpha")
plt.ylabel("weights")
plt.title("Lasso coefficients as a function of $\\alpha$")
plt.axis("tight")
plt.show()

Based on the above graph, pick an $\alpha$ value for your regression model in the cell below. We will see a number of the features completely disappear from the model.

In [ ]:
alpha = 0.1   # your alpha here -- 0.1 is only a placeholder; pick a value based on the plot above

lasso = Lasso(alpha=alpha, fit_intercept=True)
lasso.fit(X_train, y_train)
plt.barh(y = df.drop(target,axis=1).columns, width=lasso.coef_);
plt.title("Linear Regression Coefficients");
print(f"Score on test set {lasso.score(X_test, y_test):.3}")

We can keep the features with non-zero coefficients and sort the result

In [ ]:
results = pd.DataFrame({'feature': df.drop(target, axis=1).columns, 'coeff': lasso.coef_})

filtered_results = results[abs(results['coeff']) > 0].sort_values(by='coeff')

plt.barh(y = 'feature', width='coeff', data = filtered_results);
plt.title("Lasso Regression Coefficients");

This model is much smaller than the original, which may make it easier to interpret. Let's see whether our accuracy has suffered.

In [ ]:
scores = []
for i in range(100):
    df = get_data(filename)
    df = pre_process_data(df, one_hot_encode = True)
    X_train, X_test, y_train, y_test = get_test_train(df)
    lasso = Lasso(alpha=alpha, fit_intercept=True)
    model = lasso.fit(X_train, y_train)
    scores += [model.score(X_test, y_test)]
plt.scatter(x = range(len(scores)), y = scores);
print(f"Linear Model: Mean score = {np.mean(scores):.3} Stdev = {np.std(scores):.3}")

Rerun the last few cells with different alpha values and decide which alpha you like best.
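
One way to compare candidate values is a small sweep like this sketch (the alpha values below are just examples):

In [ ]:
# Compare mean test scores and model size for a few candidate alphas
df = get_data(filename)
df = pre_process_data(df, one_hot_encode = True)
for a in [0.01, 0.05, 0.1, 0.5, 1.0]:
    scores = []
    for i in range(20):
        X_train, X_test, y_train, y_test = get_test_train(df)
        lasso = Lasso(alpha=a, fit_intercept=True).fit(X_train, y_train)
        scores.append(lasso.score(X_test, y_test))
    n_nonzero = np.sum(lasso.coef_ != 0)
    print(f"alpha={a:<5} mean test R^2 = {np.mean(scores):.3}  non-zero coefficients = {n_nonzero}")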