Cancer Data Logistic Regression¶
import pandas as pd
import os
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from scipy.stats import chi2_contingency
data_root = "./"
filename = "Cancer_Data.csv"
filepath = os.path.join(data_root, filename)
df = pd.read_csv(filepath)
def get_data(filename):
df = pd.read_csv(filename)
df.drop(['radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst'], axis=1, inplace=True)
return df
df.columns
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'], dtype='object')
df.drop(['radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst'], axis=1, inplace=True)
df
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | NaN |
1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | NaN |
2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | NaN |
3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | NaN |
4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
564 | 926424 | M | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | NaN |
565 | 926682 | M | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | NaN |
566 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | NaN |
567 | 927241 | M | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | NaN |
568 | 92751 | B | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | NaN |
569 rows × 13 columns
def pre_process_data(df, one_hot_encode = False):
# Use sklearn Imputers to fill in the categorical and numerical columns
simple_median = SimpleImputer(strategy='median')
simple_most_freq = SimpleImputer(strategy='most_frequent')
num_cols = df.select_dtypes(include=np.number).columns # numerical data
cat_cols = df.select_dtypes(include=object).columns # categorical data
df[num_cols] = simple_median.fit_transform(df[num_cols])
df[cat_cols] = simple_most_freq.fit_transform(df[cat_cols])
if one_hot_encode:
O_encoder = OrdinalEncoder()
df[cat_cols]= O_encoder.fit_transform(df[cat_cols])
# df = pd.get_dummies(df, dtype=int)
return df
def get_test_train(df, test_size = 0.2, random_state = True):
target = "diagnosis"
X = df.drop(target, axis=1)
# add a scaler here. It works by finding a fit first (computing mu and sigma)
scaler = preprocessing.StandardScaler().fit(X)
# and then transforming the data
X = scaler.transform(X)
y = df[target]
if random_state is True:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)
else:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size, random_state=42)
return X_train, X_test, y_train, y_test
Start Work Here¶
Replace lreg with LogisticRegression
# These lines would load the data locally
data_root = "./"
filename = "cancer_data_cleaned.csv"
filepath = os.path.join(data_root, filename)
# Perform a logistic regression
df = get_data(filepath)
df = pre_process_data(df, one_hot_encode = True)
X_train, X_test, y_train, y_test = get_test_train(df, random_state = True)
lreg = ??????????????????
model = lreg.fit(X_train, y_train)
Cell In[10], line 10 lreg = ?????????????????? ^ SyntaxError: invalid syntax
Get the model score
pred = lreg.predict(X_test)
print(f"Train accuracy = {model.score(X_train, y_train):.3}")
print(f"Test accuracy = {model.score(X_test, y_test):.3}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[11], line 1 ----> 1 pred = lreg.predict(X_test) 2 print(f"Train accuracy = {model.score(X_train, y_train):.3}") 3 print(f"Test accuracy = {model.score(X_test, y_test):.3}") NameError: name 'lreg' is not defined
Quick snapshot of the confusion matrix (rows are truth 0/1 and cols are predictions 0/1). You can shift-tab on the parens the see the method signature)
confusion_matrix(???????????)
Cell In[12], line 1 confusion_matrix(???????????) ^ SyntaxError: invalid syntax
We want to get the probabilites from X-test, NOT the classifications. So we want raw values in (0,1). Edit this cell
# Get predicted probabilities for the test data
# You need to **CHANGE THIS CODE** and return only a vector of the probabilities for class = 1, which is the second column
y_prob = model.predict_proba(X_test)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[13], line 3 1 # Get predicted probabilities for the test data 2 # You need to **CHANGE THIS CODE** and return only a vector of the probabilities for class = 1, which is the second column ----> 3 y_prob = model.predict_proba(X_test) NameError: name 'model' is not defined
If we sort y_prob and y_test in the same order, then we can make a reasonable plot
# Sort the data for plotting
sorted_indices = np.argsort(y_prob) ## argsort returns the indices ordered by the key values, so we can copy the sort order around
sorted_y_prob = y_prob[sorted_indices]
sorted_y_test = np.array(y_test)[sorted_indices]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[14], line 2 1 # Sort the data for plotting ----> 2 sorted_indices = np.argsort(y_prob) ## argsort returns the indices ordered by the key values, so we can copy the sort order around 3 sorted_y_prob = y_prob[sorted_indices] 4 sorted_y_test = np.array(y_test)[sorted_indices] NameError: name 'y_prob' is not defined
Print the vector of sorted_y_prob and of sorted_y_test to verify the are generally increasing from 0 to 1
## Your code here
Now make a plot of a sigmoid curve against the 0/1 ground truth values. Fill in the code below as needed
# Plot the sigmoid curve (predicted probabilities)
plt.figure(figsize=(8, 6))
# Plot the sorted_y_prob vector as a line, labeled "Predicted probability" in blue
plt.plot(????????????)
# Plot the sorted_y_test values as a scatter plot, labeled "Actual Probability" in red
plt.scatter(??????????????????????)
# Plot the cutoff line (decision boundary at 0.5)
plt.axhline(0.5, color='red', linestyle='--', label='Decision boundary (0.5)')
plt.title('Logistic Regression Sigmoid Curve')
plt.xlabel('Feature value')
plt.ylabel('Predicted probability')
plt.legend()
plt.show()
Cell In[16], line 5 plt.plot(????????????) ^ SyntaxError: invalid syntax
Binary classification for logistic regression relies on knowing where the 'split point' is. It default to alpha = 0.5, but this may not be optimal. You will define some helper functions to determine an optimal alpha.
def binary_classify(y_prob, alpha):
# return a vector where index i is 1 if y_prob[i] > alpha, else 0
def tp(true, observed):
# return the number of true positives, e.g. indices i where true[i] = observed[i] = 1
def tn(true, observed):
return 0
def fp(true, observed):
return 0
def fn(true, observed):
return 0
def precision(true, observed):
return 0
def recall(true, observed):
return 0
def score(true, observed, weights):
# return a weighted score of tp, fp, tn fn according to the weights
tps, fps, tns, fns = weights
return 0
Cell In[17], line 4 def tp(true, observed): ^ IndentationError: expected an indented block after function definition on line 1
Test your code works and get a high precision
observed = binary_classify(sorted_y_prob, 0.5)
q = zip(observed, sorted_y_test)
print(precision(sorted_y_test, observed))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[18], line 1 ----> 1 observed = binary_classify(sorted_y_prob, 0.5) 2 q = zip(observed, sorted_y_test) 3 print(precision(sorted_y_test, observed)) NameError: name 'binary_classify' is not defined
Create a plot of recall vs precision for various alpha. Let alpha from from 0 to 100 in steps of 0.01. Compute the new classification, then the precision and recall score. Then plot a scatter plot of (precision(alpha), recall(alpha)) points. Code is provided to plot the alphas along the graph.
ps = []
rs = []
for i in range(100):
alpha = i/100.0
# finish code
plt.plot(ps, rs)
# Don't edit the following, it makes labels. You can tweak it once you understand if you want
# Annotate alpha values at selected points (for example, every 10th point)
for i in range(0, 100, 10): # You can change the step size (10) to select more/fewer points
alpha = i / 100.0
plt.annotate(f'α={alpha:.2f}', (ps[i], rs[i]),
textcoords="offset points", xytext=(5, -5), ha='center')
plt.grid()
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[19], line 13 11 for i in range(0, 100, 10): # You can change the step size (10) to select more/fewer points 12 alpha = i / 100.0 ---> 13 plt.annotate(f'α={alpha:.2f}', (ps[i], rs[i]), 14 textcoords="offset points", xytext=(5, -5), ha='center') 15 plt.grid() IndexError: list index out of range
Now we want to see how weighted scores vary as a function of alpha. Start with a score vector of (0,1,0,3) (why?)
ps = []
rs = []
scores = []
# code here
Get the index of the lowest score and the lowest score itself
np.argmin(scores), np.min(scores)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[21], line 1 ----> 1 np.argmin(scores), np.min(scores) File ~/github/aet-cs/aet-cs.github.io/white/ML/env/lib/python3.11/site-packages/numpy/_core/fromnumeric.py:1395, in argmin(a, axis, out, keepdims) 1307 """ 1308 Returns the indices of the minimum values along an axis. 1309 (...) 1392 (2, 1, 4) 1393 """ 1394 kwds = {'keepdims': keepdims} if keepdims is not np._NoValue else {} -> 1395 return _wrapfunc(a, 'argmin', axis=axis, out=out, **kwds) File ~/github/aet-cs/aet-cs.github.io/white/ML/env/lib/python3.11/site-packages/numpy/_core/fromnumeric.py:54, in _wrapfunc(obj, method, *args, **kwds) 52 bound = getattr(obj, method, None) 53 if bound is None: ---> 54 return _wrapit(obj, method, *args, **kwds) 56 try: 57 return bound(*args, **kwds) File ~/github/aet-cs/aet-cs.github.io/white/ML/env/lib/python3.11/site-packages/numpy/_core/fromnumeric.py:46, in _wrapit(obj, method, *args, **kwds) 43 # As this already tried the method, subok is maybe quite reasonable here 44 # but this follows what was done before. TODO: revisit this. 45 arr, = conv.as_arrays(subok=False) ---> 46 result = getattr(arr, method)(*args, **kwds) 48 return conv.wrap(result, to_scalar=False) ValueError: attempt to get argmin of an empty sequence
Properly define opt_alpha based on the scores vector
opt_alpha = ????
observed = binary_classify(sorted_y_prob, opt_alpha)
ps = precision(sorted_y_test, observed)
rs = recall(sorted_y_test, observed)
opt_tp = tp(sorted_y_test, observed)
opt_tn = tn(sorted_y_test, observed)
opt_fp = fp(sorted_y_test, observed)
opt_fn = fn(sorted_y_test, observed)
Cell In[22], line 1 opt_alpha = ???? ^ SyntaxError: invalid syntax
Make the following line nicer -- what are you printing? Give labels
print(ps, rs, opt_tp, opt_tn, opt_tp, opt_fn)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[23], line 1 ----> 1 print(ps, rs, opt_tp, opt_tn, opt_tp, opt_fn) NameError: name 'opt_tp' is not defined
Now draw a nice confusion matrix. No need to edit the code. How do you interpret this? Change the weights above and redo a few times. What happens?
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(sorted_y_test, observed, labels=lreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=lreg.classes_)
disp.plot()
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[24], line 3 1 from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay ----> 3 cm = confusion_matrix(sorted_y_test, observed, labels=lreg.classes_) 4 disp = ConfusionMatrixDisplay(confusion_matrix=cm, 5 display_labels=lreg.classes_) 6 disp.plot() NameError: name 'sorted_y_test' is not defined