Cancer Data Logistic Regression¶

In [1]:
import pandas as pd
import os
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from scipy.stats import chi2_contingency
In [3]:
data_root = "./"
filename = "Cancer_Data.csv"
filepath = os.path.join(data_root, filename)
df = pd.read_csv(filepath)
In [4]:
def get_data(filename):
    df = pd.read_csv(filename)
    df.drop(['radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'], axis=1, inplace=True)
    return df    
In [5]:
df.columns
Out[5]:
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')
In [6]:
df.drop(['radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'], axis=1, inplace=True)
In [7]:
df
Out[7]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 926424 M 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 0.1726 0.05623 NaN
565 926682 M 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 0.1752 0.05533 NaN
566 926954 M 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 0.1590 0.05648 NaN
567 927241 M 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 0.2397 0.07016 NaN
568 92751 B 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 0.1587 0.05884 NaN

569 rows × 13 columns

In [8]:
def pre_process_data(df, one_hot_encode = False):
   
 
   # Use sklearn Imputers to fill in the categorical and numerical columns
    simple_median = SimpleImputer(strategy='median')
    simple_most_freq = SimpleImputer(strategy='most_frequent')
    
    num_cols = df.select_dtypes(include=np.number).columns # numerical data
    cat_cols = df.select_dtypes(include=object).columns # categorical data

    df[num_cols] = simple_median.fit_transform(df[num_cols])
    df[cat_cols] = simple_most_freq.fit_transform(df[cat_cols])

    if one_hot_encode:
        O_encoder = OrdinalEncoder()
        df[cat_cols]= O_encoder.fit_transform(df[cat_cols])

        # df = pd.get_dummies(df, dtype=int)
    return df
In [9]:
def get_test_train(df, test_size = 0.2, random_state = True):
    target = "diagnosis"    
    X = df.drop(target, axis=1)
     # add a scaler here. It works by finding a fit first (computing mu and sigma)
    scaler = preprocessing.StandardScaler().fit(X)
    # and then transforming the data
    X = scaler.transform(X)
    
    y = df[target]
    if random_state is True:
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size, random_state=42)
    return X_train, X_test, y_train, y_test

Start Work Here¶

Replace lreg with LogisticRegression

In [10]:
# These lines would load the data locally
data_root = "./"
filename = "cancer_data_cleaned.csv"
filepath = os.path.join(data_root, filename)

# Perform a logistic regression
df = get_data(filepath)
df = pre_process_data(df, one_hot_encode = True)
X_train, X_test, y_train, y_test = get_test_train(df, random_state = True)
lreg = ??????????????????
model = lreg.fit(X_train, y_train)
  Cell In[10], line 10
    lreg = ??????????????????
           ^
SyntaxError: invalid syntax

Get the model score

In [11]:
pred = lreg.predict(X_test)
print(f"Train accuracy  = {model.score(X_train, y_train):.3}")
print(f"Test  accuracy  = {model.score(X_test, y_test):.3}")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 pred = lreg.predict(X_test)
      2 print(f"Train accuracy  = {model.score(X_train, y_train):.3}")
      3 print(f"Test  accuracy  = {model.score(X_test, y_test):.3}")

NameError: name 'lreg' is not defined

Quick snapshot of the confusion matrix (rows are truth 0/1 and cols are predictions 0/1). You can shift-tab on the parens the see the method signature)

In [12]:
confusion_matrix(???????????)
  Cell In[12], line 1
    confusion_matrix(???????????)
                     ^
SyntaxError: invalid syntax

We want to get the probabilites from X-test, NOT the classifications. So we want raw values in (0,1). Edit this cell

In [13]:
# Get predicted probabilities for the test data
# You need to **CHANGE THIS CODE** and return only a vector of the probabilities for class = 1, which is the second column
y_prob = model.predict_proba(X_test)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 3
      1 # Get predicted probabilities for the test data
      2 # You need to **CHANGE THIS CODE** and return only a vector of the probabilities for class = 1, which is the second column
----> 3 y_prob = model.predict_proba(X_test)

NameError: name 'model' is not defined

If we sort y_prob and y_test in the same order, then we can make a reasonable plot

In [14]:
# Sort the data for plotting
sorted_indices = np.argsort(y_prob)  ## argsort returns the indices ordered by the key values, so we can copy the sort order around
sorted_y_prob = y_prob[sorted_indices]
sorted_y_test = np.array(y_test)[sorted_indices]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 2
      1 # Sort the data for plotting
----> 2 sorted_indices = np.argsort(y_prob)  ## argsort returns the indices ordered by the key values, so we can copy the sort order around
      3 sorted_y_prob = y_prob[sorted_indices]
      4 sorted_y_test = np.array(y_test)[sorted_indices]

NameError: name 'y_prob' is not defined

Print the vector of sorted_y_prob and of sorted_y_test to verify the are generally increasing from 0 to 1

In [15]:
## Your code here

Now make a plot of a sigmoid curve against the 0/1 ground truth values. Fill in the code below as needed

In [16]:
# Plot the sigmoid curve (predicted probabilities)
plt.figure(figsize=(8, 6))

# Plot the sorted_y_prob vector as a line, labeled "Predicted probability" in blue
plt.plot(????????????)

# Plot the sorted_y_test values as a scatter plot, labeled "Actual Probability" in red
plt.scatter(??????????????????????)

# Plot the cutoff line (decision boundary at 0.5)
plt.axhline(0.5, color='red', linestyle='--', label='Decision boundary (0.5)')

plt.title('Logistic Regression Sigmoid Curve')
plt.xlabel('Feature value')
plt.ylabel('Predicted probability')
plt.legend()
plt.show()
  Cell In[16], line 5
    plt.plot(????????????)
             ^
SyntaxError: invalid syntax

Binary classification for logistic regression relies on knowing where the 'split point' is. It default to alpha = 0.5, but this may not be optimal. You will define some helper functions to determine an optimal alpha.

In [17]:
def binary_classify(y_prob, alpha):
    # return a vector where index i is 1 if y_prob[i] > alpha, else 0 
    
def tp(true, observed): 
    # return the number of true positives, e.g. indices i where true[i] = observed[i] = 1

def tn(true, observed): 
    return 0

def fp(true, observed): 
    return 0

def fn(true, observed): 
    return 0

def precision(true, observed): 
    return 0

def recall(true, observed): 
    return 0
                                 
def score(true, observed, weights):
    # return a weighted score of tp, fp, tn fn according to the weights
    tps, fps, tns, fns = weights
    return 0
  Cell In[17], line 4
    def tp(true, observed):
    ^
IndentationError: expected an indented block after function definition on line 1

Test your code works and get a high precision

In [18]:
observed = binary_classify(sorted_y_prob, 0.5)
q = zip(observed, sorted_y_test)
print(precision(sorted_y_test, observed))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 1
----> 1 observed = binary_classify(sorted_y_prob, 0.5)
      2 q = zip(observed, sorted_y_test)
      3 print(precision(sorted_y_test, observed))

NameError: name 'binary_classify' is not defined

Create a plot of recall vs precision for various alpha. Let alpha from from 0 to 100 in steps of 0.01. Compute the new classification, then the precision and recall score. Then plot a scatter plot of (precision(alpha), recall(alpha)) points. Code is provided to plot the alphas along the graph.

In [19]:
ps = []
rs = []
for i in range(100):
    alpha = i/100.0
    # finish code

plt.plot(ps, rs)

# Don't edit the following, it makes labels. You can tweak it once you understand if you want
# Annotate alpha values at selected points (for example, every 10th point)
for i in range(0, 100, 10):  # You can change the step size (10) to select more/fewer points
    alpha = i / 100.0
    plt.annotate(f'α={alpha:.2f}', (ps[i], rs[i]), 
                 textcoords="offset points", xytext=(5, -5), ha='center')
plt.grid()
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[19], line 13
     11 for i in range(0, 100, 10):  # You can change the step size (10) to select more/fewer points
     12     alpha = i / 100.0
---> 13     plt.annotate(f'α={alpha:.2f}', (ps[i], rs[i]), 
     14                  textcoords="offset points", xytext=(5, -5), ha='center')
     15 plt.grid()

IndexError: list index out of range
No description has been provided for this image

Now we want to see how weighted scores vary as a function of alpha. Start with a score vector of (0,1,0,3) (why?)

In [20]:
ps = []
rs = []
scores = []
# code here

Get the index of the lowest score and the lowest score itself

In [21]:
np.argmin(scores), np.min(scores)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[21], line 1
----> 1 np.argmin(scores), np.min(scores)

File ~/github/aet-cs/aet-cs.github.io/white/ML/env/lib/python3.11/site-packages/numpy/_core/fromnumeric.py:1395, in argmin(a, axis, out, keepdims)
   1307 """
   1308 Returns the indices of the minimum values along an axis.
   1309 
   (...)
   1392 (2, 1, 4)
   1393 """
   1394 kwds = {'keepdims': keepdims} if keepdims is not np._NoValue else {}
-> 1395 return _wrapfunc(a, 'argmin', axis=axis, out=out, **kwds)

File ~/github/aet-cs/aet-cs.github.io/white/ML/env/lib/python3.11/site-packages/numpy/_core/fromnumeric.py:54, in _wrapfunc(obj, method, *args, **kwds)
     52 bound = getattr(obj, method, None)
     53 if bound is None:
---> 54     return _wrapit(obj, method, *args, **kwds)
     56 try:
     57     return bound(*args, **kwds)

File ~/github/aet-cs/aet-cs.github.io/white/ML/env/lib/python3.11/site-packages/numpy/_core/fromnumeric.py:46, in _wrapit(obj, method, *args, **kwds)
     43 # As this already tried the method, subok is maybe quite reasonable here
     44 # but this follows what was done before. TODO: revisit this.
     45 arr, = conv.as_arrays(subok=False)
---> 46 result = getattr(arr, method)(*args, **kwds)
     48 return conv.wrap(result, to_scalar=False)

ValueError: attempt to get argmin of an empty sequence

Properly define opt_alpha based on the scores vector

In [22]:
opt_alpha = ????
observed = binary_classify(sorted_y_prob, opt_alpha)
ps = precision(sorted_y_test, observed)
rs = recall(sorted_y_test, observed)
opt_tp = tp(sorted_y_test, observed)
opt_tn = tn(sorted_y_test, observed)
opt_fp = fp(sorted_y_test, observed)
opt_fn = fn(sorted_y_test, observed)
  Cell In[22], line 1
    opt_alpha = ????
                ^
SyntaxError: invalid syntax

Make the following line nicer -- what are you printing? Give labels

In [23]:
print(ps, rs, opt_tp, opt_tn, opt_tp, opt_fn)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[23], line 1
----> 1 print(ps, rs, opt_tp, opt_tn, opt_tp, opt_fn)

NameError: name 'opt_tp' is not defined

Now draw a nice confusion matrix. No need to edit the code. How do you interpret this? Change the weights above and redo a few times. What happens?

In [24]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(sorted_y_test, observed, labels=lreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=lreg.classes_)
disp.plot()
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 3
      1 from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
----> 3 cm = confusion_matrix(sorted_y_test, observed, labels=lreg.classes_)
      4 disp = ConfusionMatrixDisplay(confusion_matrix=cm,
      5                               display_labels=lreg.classes_)
      6 disp.plot()

NameError: name 'sorted_y_test' is not defined
In [ ]: