import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Step 1: Load the dataset
# Column names are passed explicitly via `names=`, so the first row of the
# file is read as data rather than as a header.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'income'
]
# skipinitialspace=True strips the blank that follows each delimiter before
# the field is compared against na_values, so a missing entry reaches the
# parser as "?" and not " ?". The marker therefore has to be given without the
# leading space, otherwise no value is ever recognised as NaN.
data = pd.read_csv(url, names=column_names, na_values="?", skipinitialspace=True)

# Step 2: Preprocess the data
# Discard every row that contains a NaN (imputing the gaps would be the alternative).
data.dropna(inplace=True)

# Integer-encode each object-dtype (text) column. A single encoder is re-fitted
# for every column, so once the loop ends it only holds the mapping of the
# last column it processed.
categorical_columns = data.select_dtypes(include='object').columns
label_encoder = LabelEncoder()

for column_name in categorical_columns:
    encoded_values = label_encoder.fit_transform(data[column_name])
    data[column_name] = encoded_values

# Separate the target column ('income') from the predictor columns.
y = data['income']
X = data.drop('income', axis=1)

# Step 3: Hold out 30% of the rows for testing (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 4: Fit an entropy-criterion decision tree on the training portion
model = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)

# Step 5: Predict the held-out rows and report the fraction classified correctly
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Step 6: Visualize the Decision Tree
# The classifier is grown without a depth limit, so drawing every node would
# produce an unreadable figure that is slow to render. Only the top levels are
# drawn; deeper subtrees are collapsed by plot_tree.
plt.figure(figsize=(20, 10))
plot_tree(
    model,
    max_depth=3,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(X.columns),
    # Assumes 'income' was label-encoded in sorted order
    # (0 -> '<=50K', 1 -> '>50K'), which is how LabelEncoder assigns codes.
    class_names=['<=50K', '>50K'],
    filled=True,
    rounded=True,
    fontsize=12,
)
plt.title("Decision Tree Visualization")
plt.show()

# Decision Tree Income Data

## EDA

## Training Improvement

## Conclusion