import pandas as pd
import os
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from scipy.stats import chi2_contingency

data_root = "./"
filename = "Cancer_Data.csv"
filepath = os.path.join(data_root, filename)
df = pd.read_csv(filepath)

def get_data(filename):
    df = pd.read_csv(filename)
    df.drop(['radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'], axis=1, inplace=True)
    return df

df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

df.drop(['radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'], axis=1, inplace=True)

df

def pre_process_data(df, one_hot_encode = False):
   
 
   # Use sklearn Imputers to fill in the categorical and numerical columns
    simple_median = SimpleImputer(strategy='median')
    simple_most_freq = SimpleImputer(strategy='most_frequent')
    
    num_cols = df.select_dtypes(include=np.number).columns # numerical data
    cat_cols = df.select_dtypes(include=object).columns # categorical data

    df[num_cols] = simple_median.fit_transform(df[num_cols])
    df[cat_cols] = simple_most_freq.fit_transform(df[cat_cols])

    if one_hot_encode:
        O_encoder = OrdinalEncoder()
        df[cat_cols]= O_encoder.fit_transform(df[cat_cols])

        # df = pd.get_dummies(df, dtype=int)
    return df

def get_test_train(df, test_size = 0.2, random_state = True):
    target = "diagnosis"    
    X = df.drop(target, axis=1)
     # add a scaler here. It works by finding a fit first (computing mu and sigma)
    scaler = preprocessing.StandardScaler().fit(X)
    # and then transforming the data
    X = scaler.transform(X)
    
    y = df[target]
    if random_state is True:
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size, random_state=42)
    return X_train, X_test, y_train, y_test

# These lines would load the data locally
data_root = "./"
filename = "cancer_data_cleaned.csv"
filepath = os.path.join(data_root, filename)

# Perform a logistic regression
df = get_data(filepath)
df = pre_process_data(df, one_hot_encode = True)
X_train, X_test, y_train, y_test = get_test_train(df, random_state = True)
lreg = ??????????????????
model = lreg.fit(X_train, y_train)

  Cell In[10], line 10
    lreg = ??????????????????
           ^
SyntaxError: invalid syntax

pred = lreg.predict(X_test)
print(f"Train accuracy  = {model.score(X_train, y_train):.3}")
print(f"Test  accuracy  = {model.score(X_test, y_test):.3}")

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 pred = lreg.predict(X_test)
      2 print(f"Train accuracy  = {model.score(X_train, y_train):.3}")
      3 print(f"Test  accuracy  = {model.score(X_test, y_test):.3}")

NameError: name 'lreg' is not defined

confusion_matrix(???????????)

  Cell In[12], line 1
    confusion_matrix(???????????)
                     ^
SyntaxError: invalid syntax

# Get predicted probabilities for the test data
# You need to **CHANGE THIS CODE** and return only a vector of the probabilities for class = 1, which is the second column
y_prob = model.predict_proba(X_test)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 3
      1 # Get predicted probabilities for the test data
      2 # You need to **CHANGE THIS CODE** and return only a vector of the probabilities for class = 1, which is the second column
----> 3 y_prob = model.predict_proba(X_test)

NameError: name 'model' is not defined

# Sort the data for plotting
sorted_indices = np.argsort(y_prob)  ## argsort returns the indices ordered by the key values, so we can copy the sort order around
sorted_y_prob = y_prob[sorted_indices]
sorted_y_test = np.array(y_test)[sorted_indices]

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 2
      1 # Sort the data for plotting
----> 2 sorted_indices = np.argsort(y_prob)  ## argsort returns the indices ordered by the key values, so we can copy the sort order around
      3 sorted_y_prob = y_prob[sorted_indices]
      4 sorted_y_test = np.array(y_test)[sorted_indices]

NameError: name 'y_prob' is not defined

## Your code here

# Plot the sigmoid curve (predicted probabilities)
plt.figure(figsize=(8, 6))

# Plot the sorted_y_prob vector as a line, labeled "Predicted probability" in blue
plt.plot(????????????)

# Plot the sorted_y_test values as a scatter plot, labeled "Actual Probability" in red
plt.scatter(??????????????????????)

# Plot the cutoff line (decision boundary at 0.5)
plt.axhline(0.5, color='red', linestyle='--', label='Decision boundary (0.5)')

plt.title('Logistic Regression Sigmoid Curve')
plt.xlabel('Feature value')
plt.ylabel('Predicted probability')
plt.legend()
plt.show()

  Cell In[16], line 5
    plt.plot(????????????)
             ^
SyntaxError: invalid syntax

def binary_classify(y_prob, alpha):
    # return a vector where index i is 1 if y_prob[i] > alpha, else 0 
    
def tp(true, observed): 
    # return the number of true positives, e.g. indices i where true[i] = observed[i] = 1

def tn(true, observed): 
    return 0

def fp(true, observed): 
    return 0

def fn(true, observed): 
    return 0

def precision(true, observed): 
    return 0

def recall(true, observed): 
    return 0
                                 
def score(true, observed, weights):
    # return a weighted score of tp, fp, tn fn according to the weights
    tps, fps, tns, fns = weights
    return 0

  Cell In[17], line 4
    def tp(true, observed):
    ^
IndentationError: expected an indented block after function definition on line 1

observed = binary_classify(sorted_y_prob, 0.5)
q = zip(observed, sorted_y_test)
print(precision(sorted_y_test, observed))

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 1
----> 1 observed = binary_classify(sorted_y_prob, 0.5)
      2 q = zip(observed, sorted_y_test)
      3 print(precision(sorted_y_test, observed))

NameError: name 'binary_classify' is not defined

ps = []
rs = []
for i in range(100):
    alpha = i/100.0
    # finish code

plt.plot(ps, rs)

# Don't edit the following, it makes labels. You can tweak it once you understand if you want
# Annotate alpha values at selected points (for example, every 10th point)
for i in range(0, 100, 10):  # You can change the step size (10) to select more/fewer points
    alpha = i / 100.0
    plt.annotate(f'α={alpha:.2f}', (ps[i], rs[i]), 
                 textcoords="offset points", xytext=(5, -5), ha='center')
plt.grid()

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[19], line 13
     11 for i in range(0, 100, 10):  # You can change the step size (10) to select more/fewer points
     12     alpha = i / 100.0
---> 13     plt.annotate(f'α={alpha:.2f}', (ps[i], rs[i]), 
     14                  textcoords="offset points", xytext=(5, -5), ha='center')
     15 plt.grid()

IndexError: list index out of range

ps = []
rs = []
scores = []
# code here

np.argmin(scores), np.min(scores)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[21], line 1
----> 1 np.argmin(scores), np.min(scores)

File ~/github/aet-cs/aet-cs.github.io/white/ML/env/lib/python3.11/site-packages/numpy/_core/fromnumeric.py:1395, in argmin(a, axis, out, keepdims)
   1307 """
   1308 Returns the indices of the minimum values along an axis.
   1309 
   (...)
   1392 (2, 1, 4)
   1393 """
   1394 kwds = {'keepdims': keepdims} if keepdims is not np._NoValue else {}
-> 1395 return _wrapfunc(a, 'argmin', axis=axis, out=out, **kwds)

File ~/github/aet-cs/aet-cs.github.io/white/ML/env/lib/python3.11/site-packages/numpy/_core/fromnumeric.py:54, in _wrapfunc(obj, method, *args, **kwds)
     52 bound = getattr(obj, method, None)
     53 if bound is None:
---> 54     return _wrapit(obj, method, *args, **kwds)
     56 try:
     57     return bound(*args, **kwds)

File ~/github/aet-cs/aet-cs.github.io/white/ML/env/lib/python3.11/site-packages/numpy/_core/fromnumeric.py:46, in _wrapit(obj, method, *args, **kwds)
     43 # As this already tried the method, subok is maybe quite reasonable here
     44 # but this follows what was done before. TODO: revisit this.
     45 arr, = conv.as_arrays(subok=False)
---> 46 result = getattr(arr, method)(*args, **kwds)
     48 return conv.wrap(result, to_scalar=False)

ValueError: attempt to get argmin of an empty sequence

opt_alpha = ????
observed = binary_classify(sorted_y_prob, opt_alpha)
ps = precision(sorted_y_test, observed)
rs = recall(sorted_y_test, observed)
opt_tp = tp(sorted_y_test, observed)
opt_tn = tn(sorted_y_test, observed)
opt_fp = fp(sorted_y_test, observed)
opt_fn = fn(sorted_y_test, observed)

  Cell In[22], line 1
    opt_alpha = ????
                ^
SyntaxError: invalid syntax

print(ps, rs, opt_tp, opt_tn, opt_tp, opt_fn)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[23], line 1
----> 1 print(ps, rs, opt_tp, opt_tn, opt_tp, opt_fn)

NameError: name 'opt_tp' is not defined

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(sorted_y_test, observed, labels=lreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=lreg.classes_)
disp.plot()
plt.show()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 3
      1 from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
----> 3 cm = confusion_matrix(sorted_y_test, observed, labels=lreg.classes_)
      4 disp = ConfusionMatrixDisplay(confusion_matrix=cm,
      5                               display_labels=lreg.classes_)
      6 disp.plot()

NameError: name 'sorted_y_test' is not defined

	id	diagnosis	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	symmetry_mean	fractal_dimension_mean	Unnamed: 32
0	842302	M	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.30010	0.14710	0.2419	0.07871	NaN
1	842517	M	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.08690	0.07017	0.1812	0.05667	NaN
2	84300903	M	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.19740	0.12790	0.2069	0.05999	NaN
3	84348301	M	11.42	20.38	77.58	386.1	0.14250	0.28390	0.24140	0.10520	0.2597	0.09744	NaN
4	84358402	M	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.19800	0.10430	0.1809	0.05883	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...
564	926424	M	21.56	22.39	142.00	1479.0	0.11100	0.11590	0.24390	0.13890	0.1726	0.05623	NaN
565	926682	M	20.13	28.25	131.20	1261.0	0.09780	0.10340	0.14400	0.09791	0.1752	0.05533	NaN
566	926954	M	16.60	28.08	108.30	858.1	0.08455	0.10230	0.09251	0.05302	0.1590	0.05648	NaN
567	927241	M	20.60	29.33	140.10	1265.0	0.11780	0.27700	0.35140	0.15200	0.2397	0.07016	NaN
568	92751	B	7.76	24.54	47.92	181.0	0.05263	0.04362	0.00000	0.00000	0.1587	0.05884	NaN

Cancer Data Logistic Regression¶

Start Work Here¶