Machine Learning Penguins
What is a penguin’s favourite type of lettuce?
Iceberg!
🐧 Introduction To Applied Python: Penguins Analysis 🐧 ¶
Hello!
Today, we will be evaluating the Palmer Penguins data set collected by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network.
Using visual and mathematical analysis alongside machine learning models, we can create a predictive model that will be able to evaluate the species of a penguin based on limited information.
Outlined below are the steps we will take on our computational analysis journey!
- Importing Data and Modules
- Exploratory Analysis
- Cleaning and Splitting of Data
- Modeling of Machine Learning
- Visualization and Testing of Machine Learning Models
Initial Set Up: Data and Package Importing ¶
As always, we must import the libraries and modules necessary for visualization and analysis.
The following imports will help us produce and troubleshoot our programs. While you may not see some of them used right now, it's imperative to import everything into your environment at the start to keep things organized and avoid bugs!
import matplotlib.pyplot as plt
import random
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import tree, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
#read in data
url = 'https://philchodrow.github.io/PIC16A/datasets/palmer_penguins.csv'
penguins = pd.read_csv(url)
#taking a sneak-peek at the data
penguins.head(3)
| | studyName | Sample Number | Species | Region | Island | Stage | Individual ID | Clutch Completion | Date Egg | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | Comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PAL0708 | 1 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A1 | Yes | 11/11/07 | 39.1 | 18.7 | 181.0 | 3750.0 | MALE | NaN | NaN | Not enough blood for isotopes. |
| 1 | PAL0708 | 2 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A2 | Yes | 11/11/07 | 39.5 | 17.4 | 186.0 | 3800.0 | FEMALE | 8.94956 | -24.69454 | NaN |
| 2 | PAL0708 | 3 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N2A1 | Yes | 11/16/07 | 40.3 | 18.0 | 195.0 | 3250.0 | FEMALE | 8.36821 | -25.33302 | NaN |
Looking at the first few rows of the dataframe allows us to take a quick look at what type of data we are working with.
| Variable Name | Description |
|---|---|
| 'studyName' | Name of the study the sample was collected under |
| 'Sample Number' | Sample number within the study |
| 'Species' | Penguin species of the sample |
| 'Region' | Region where the sample was collected |
| 'Island' | Island where the sample lives |
| 'Stage' | Stage of life of the sample |
| 'Individual ID' | ID of the sample |
| 'Clutch Completion' | Whether the clutch was completed (Yes/No) |
| 'Date Egg' | Date the egg was observed |
| 'Culmen Length (mm)' | Length of the culmen (the upper ridge of the bill) |
| 'Culmen Depth (mm)' | Depth of the culmen |
| 'Flipper Length (mm)' | Flipper length |
| 'Body Mass (g)' | Body mass |
| 'Sex' | MALE/FEMALE |
| 'Delta 15 N (o/oo)' | Nitrogen isotope ratio from blood samples |
| 'Delta 13 C (o/oo)' | Carbon isotope ratio from blood samples |
| 'Comments' | Extra comments |
Exploratory Analysis ¶
As seen in the glimpse of the dataset, there is a large amount of data to parse through so that we can find the columns relevant to our analysis ¶
In order to prepare our data for visual analysis and machine learning, we must first undergo exploratory analysis to understand the penguins dataset better and evaluate what types of data we should be moving forward with.
In this instance, there are two goals we need to achieve in exploratory analysis:
- Mathematically evaluate quantitative data by qualitative features
- Visually evaluate data
#isolate columns of interest from the penguins dataset
explore = penguins[["Species","Island","Culmen Depth (mm)","Body Mass (g)","Sex",
"Culmen Length (mm)","Flipper Length (mm)","Delta 15 N (o/oo)",
"Delta 13 C (o/oo)"]]
#functions should always have a docstring, so that when the help method is invoked on your custom functions,
#it returns informative documentation on what the function does.
def infomaker(df):
    """
    Function will take the penguins dataframe and make a new dataframe containing the means of the numerical columns
    """
    info = df[["Culmen Depth (mm)","Body Mass (g)","Culmen Length (mm)","Flipper Length (mm)","Delta 15 N (o/oo)","Delta 13 C (o/oo)"]].mean()
    return info
#apply infomaker function on groups of the explore dataframe based on "Species","Island","Sex"
show = explore.groupby(["Species","Island","Sex"]).apply(infomaker)
display(show)
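As an aside, the same summary table can be produced without a helper function by grouping and then averaging only the numeric measurement columns directly. Here is a minimal sketch of that alternative (numeric_cols is just an illustrative name):
#sketch: group by the qualitative features, then average only the numeric measurement columns
numeric_cols = ["Culmen Depth (mm)", "Body Mass (g)", "Culmen Length (mm)",
                "Flipper Length (mm)", "Delta 15 N (o/oo)", "Delta 13 C (o/oo)"]
display(explore.groupby(["Species", "Island", "Sex"])[numeric_cols].mean())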
fig, ax = plt.subplots(1)
fig.set_size_inches(10,10)
plt.title("Culmen Depth and Body Mass by Species and Island")
sns.scatterplot(
x = "Culmen Depth (mm)",
y = "Body Mass (g)",
hue = "Species", #hue will automtically parse the different unique ID's in the category
style = "Island", #style will automatically parse the different unique ID's in the category
palette = "deep",
s = 200,
alpha = 0.7,
data = explore)
plt.legend(loc="lower right", bbox_to_anchor=(1.5, 0))
plt.tight_layout()
plt.show()
fig, ax = plt.subplots(1)
fig.set_size_inches(14,8)
plt.title("Spread of Species on Islands")
plt.ylabel("Density")
sns.histplot(data=penguins,
x="Island",
hue = "Species",
multiple = "dodge",
palette = "pastel",
shrink = 0.7)
plt.tight_layout()
plt.show()
| Species | Island | Sex | Culmen Depth (mm) | Body Mass (g) | Culmen Length (mm) | Flipper Length (mm) | Delta 15 N (o/oo) | Delta 13 C (o/oo) |
|---|---|---|---|---|---|---|---|---|
| Adelie Penguin (Pygoscelis adeliae) | Biscoe | FEMALE | 17.704545 | 3369.318182 | 37.359091 | 187.181818 | 8.774242 | -25.920176 |
| Adelie Penguin (Pygoscelis adeliae) | Biscoe | MALE | 19.036364 | 4050.000000 | 40.590909 | 190.409091 | 8.872945 | -25.917227 |
| Adelie Penguin (Pygoscelis adeliae) | Dream | FEMALE | 17.618519 | 3344.444444 | 36.911111 | 187.851852 | 8.914803 | -25.736636 |
| Adelie Penguin (Pygoscelis adeliae) | Dream | MALE | 18.839286 | 4045.535714 | 40.071429 | 191.928571 | 8.984427 | -25.759120 |
| Adelie Penguin (Pygoscelis adeliae) | Torgersen | FEMALE | 17.550000 | 3395.833333 | 37.554167 | 188.291667 | 8.663160 | -25.738735 |
| Adelie Penguin (Pygoscelis adeliae) | Torgersen | MALE | 19.391304 | 4034.782609 | 40.586957 | 194.913043 | 8.919919 | -25.835347 |
| Chinstrap penguin (Pygoscelis antarctica) | Dream | FEMALE | 17.588235 | 3527.205882 | 46.573529 | 191.735294 | 9.250962 | -24.565405 |
| Chinstrap penguin (Pygoscelis antarctica) | Dream | MALE | 19.252941 | 3938.970588 | 51.094118 | 199.911765 | 9.464535 | -24.527679 |
| Gentoo penguin (Pygoscelis papua) | Biscoe | . | 15.700000 | 4875.000000 | 44.500000 | 217.000000 | 8.041110 | -26.184440 |
| Gentoo penguin (Pygoscelis papua) | Biscoe | FEMALE | 14.237931 | 4679.741379 | 45.563793 | 212.706897 | 8.193405 | -26.197205 |
| Gentoo penguin (Pygoscelis papua) | Biscoe | MALE | 15.718033 | 5484.836066 | 49.473770 | 221.540984 | 8.303429 | -26.170608 |
Summary from Exploratory Analysis ¶
The summary table of mean values shows that the Delta 15 N and Delta 13 C values do not differ dramatically across species and sexes. On the other hand, the other four features show promising differences between species: the Gentoo penguins are noticeably larger in body mass and flipper length than the Adelies and Chinstraps, though their culmens are shallower.
From the scatterplot, we can see that when we plot the species on the two axes of body mass and culmen depth, the distinction between Gentoo and the other two species is very apparent; however, the distinction between the Adelie and Chinstrap species is much less apparent within the parameters we chose.
From the histogram, it can be seen that Adelie is a generalist species, in that they can be found on all three islands (Torgersen, Biscoe, and Dream). The Chinstrap and Gentoo species are considered to be specialist species in that each of the species can be found on only one island. The Chinstrap species can only be found on the Dream island, while the Gentoo species can only be found on the Biscoe island. Therefore, an unknown penguin's island is integral for our model to identify its species ID.
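If you want to confirm this island pattern numerically as well as visually, a quick cross-tabulation of island against species (a minimal sketch using the penguins dataframe from above) shows the same thing:
#sketch: count how many penguins of each species were sampled on each island
pd.crosstab(penguins["Island"], penguins["Species"])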
Feel free to return to this section and evaluate different sets of variables to see what other relationships you can find!
Data Cleaning ¶
Before we use the data from the penguins dataset, we need to clean the dataset. In order to avoid polluting the test set and potentially removing valuable data points, we will be splitting the data into 70/30 train and test sets before cleaning.
Note. It is common practice (practically standard) to use train-test-split in machine learning, as it is a simple but powerful way to ensure that your model is evaluated on data it never saw during training.
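The split itself happens a few cells below. One optional refinement, shown here only as a hedged sketch (not what we use), is to stratify the split by species so that the 70/30 train and test sets keep roughly the same species proportions, and to fix a random_state for reproducibility. The names train_strat/test_strat and the random_state value are arbitrary choices for illustration.
#optional sketch (not used below): a stratified, reproducible 70/30 split
train_strat, test_strat = train_test_split(penguins, test_size=0.3,
                                            stratify=penguins["Species"],
                                            random_state=42)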
def convert(df_original):
"""
This function will take in the penguins dataframe and select specified columns to create a new dataframe.
Then, it will be cleansed of any N/A values, have its qualitative features transformed by LabelEncoder,
and finally be split into the predictor and target variables
"""
df = df_original.copy()
#isolate columns found relevant from exploratory analysis
df = df[["Species", "Island","Culmen Depth (mm)","Body Mass (g)", "Culmen Length (mm)","Flipper Length (mm)"]]
df = df.dropna()
#LabelEncoder will change labels into numerical values in alphabetical order (of labels)
le = preprocessing.LabelEncoder()
df['Species'] = le.fit_transform(df['Species'])
#Adelie = 0, Chinstrap = 1, Gentoo = 2
df['Island'] = le.fit_transform(df['Island'])
#Biscoe Island = 0, Dream = 1, Torgersen = 2
#splitting data
X = df.drop(['Species'], axis = 1)
y = df['Species']
return(X, y)
train, test = train_test_split(penguins, test_size = .3)
train_X, train_y = convert(train)
test_X, test_y = convert(test)
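One thing worth noting: convert fits a fresh LabelEncoder on whichever dataframe it receives, so the numeric codes for the train and test splits only stay consistent because both splits contain every species and every island (true for this dataset after a 70/30 split, but worth verifying). Here is a minimal sketch for printing the alphabetical mapping that LabelEncoder produces (le_check is just an illustrative name):
#sketch: inspect the species -> code mapping produced by LabelEncoder
le_check = preprocessing.LabelEncoder()
le_check.fit(penguins["Species"])
print(dict(zip(le_check.classes_, le_check.transform(le_check.classes_))))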
Now that we have split and cleaned our data, let's look at the training data that we will feed into our machine learning models.
train_X.head()
| | Island | Culmen Depth (mm) | Body Mass (g) | Culmen Length (mm) | Flipper Length (mm) |
|---|---|---|---|---|---|
| 164 | 1 | 17.3 | 3700.0 | 47.0 | 185.0 |
| 242 | 0 | 14.5 | 4400.0 | 46.5 | 213.0 |
| 179 | 1 | 19.0 | 3800.0 | 49.5 | 200.0 |
| 162 | 1 | 17.8 | 3800.0 | 46.6 | 193.0 |
| 2 | 2 | 18.0 | 3250.0 | 40.3 | 195.0 |
train_y.head()
164    1
242    2
179    1
162    1
2      0
Name: Species, dtype: int32
As seen above, the first 5 rows of train_X now contain only relevant, numerical data, with the species ID of the samples dropped. train_y shares the same index as train_X but contains only the target data: species ID.
Modeling ¶
For our modeling component, we will use the following two ML classifiers from the sklearn module:
- Logistic Regression
- Support Vector Machines
These two machine learning classifiers will enable us to predict the species of a penguin based on some marked characteristics of the penguin.
Note. There are tons of different machine learning classifiers in the sklearn module, as well as many that can be found outside the module! Feel free to explore and try out different classifiers to see which perform the best!
Feature Selection ¶
Before we can begin fitting our machine learning models, we must first select the features that will give us the highest accuracy. As a limit, we will ultimately feed the machine learning model 1 qualitative and 2 quantitative features.
The list down below contains all of the feature combinations we will test on the machine learning classifiers to determine which combination yields the highest accuracy.
Note: For our qualitative feature I have explicitly chosen "Island", due to the prior exploratory analysis. Among the qualitative features, there was a noticeable pattern where the island a sample lives on was highly correlated with its species. Unlike sex, which is ideally split roughly 50/50 in every species, I made the assumption that the island would have more predictive power for species ID. As with many of the sections before - feel free to use whatever features interest you!
combos = [
["Island", "Body Mass (g)", "Flipper Length (mm)"],
["Island", "Body Mass (g)", "Culmen Depth (mm)"],
["Island", "Body Mass (g)", "Culmen Length (mm)"],
["Island", "Culmen Depth (mm)", "Flipper Length (mm)"],
["Island", "Culmen Depth (mm)", "Culmen Length (mm)"],
["Island", "Culmen Length (mm)", "Flipper Length (mm)"],
]
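If you would rather not type the list out by hand, the same six combinations (possibly in a different order) can be generated programmatically; a small sketch using itertools, with "Island" fixed as the qualitative feature:
#sketch: pair every two quantitative features with the "Island" feature
from itertools import combinations
quant = ["Body Mass (g)", "Culmen Depth (mm)", "Culmen Length (mm)", "Flipper Length (mm)"]
combos_alt = [["Island"] + list(pair) for pair in combinations(quant, 2)]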
To find what combinations fit best with our models, we must first create our ML classifiers
LR = LogisticRegression()
vecml = SVC()
Next, we must create a function that will evaluate the best combo for our classifiers
def cross_val_checker(c):
    """
    This function takes advantage of a validation method called cross-validation, which
    provides a more robust evaluation of an ML classifier's accuracy on each feature
    combination in the combos list.
    The two returned values are the highest accuracy and the combination that yielded it.
    """
    best_perm = -np.inf
    best_cols = None
    #iterate over the feature combinations
    for cols in combos:
        #mean 5-fold cross-validation score for this combination
        score = cross_val_score(c, train_X[cols], train_y, cv=5).mean()
        #update statement for the best score/column combination so far
        if score > best_perm:
            best_perm = score
            best_cols = cols
    return best_perm, best_cols
Before we check the columns, we will use a nifty function to suppress the ConvergenceWarnings that would otherwise appear while we evaluate the columns.
simplefilter("ignore", category=ConvergenceWarning)
Now that we have created a function that will select the best features for us, we can use it on our ML classifiers to find the best-fitting combination for each.
LR_best_perm, LR_best_cols = cross_val_checker(LR)
vecml_best_perm, vecml_best_cols = cross_val_checker(vecml)
print(LR_best_cols, vecml_best_cols)
['Island', 'Culmen Depth (mm)', 'Culmen Length (mm)'] ['Island', 'Culmen Depth (mm)', 'Culmen Length (mm)']
From our cross-validation method, we found the combination
['Island', 'Culmen Depth (mm)', 'Culmen Length (mm)']
to yield the most accurate score out of all the possible combinations for both the Logistic Regression model and the Support Vector Machine model.
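If you are curious how close the runner-up combinations were, here is a small sketch that prints the mean cross-validation score for every combination (using the same cross_val_score call as in cross_val_checker):
#sketch: mean 5-fold CV accuracy of the Logistic Regression classifier for each combination
for cols in combos:
    score = cross_val_score(LR, train_X[cols], train_y, cv=5).mean()
    print(round(score, 3), cols)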
Now that we have chosen our features for our machine learning models, we can start fitting and testing our models!
Logistic Regression Classifier ¶
Now that we've done all the preparation work, let's get cracking and see our hard work come to fruition!
# best permutation score and corresponding columns that produced the score
LR_best_perm, LR_best_cols
(0.9749113475177305, ['Island', 'Culmen Depth (mm)', 'Culmen Length (mm)'])
Given this, we can now use LR_best_cols as our features to train the classifier on.
To find the best version of our classifier, we will use GridSearchCV from sklearn, which will evaluate a param_grid containing the parameters we want to compare within the LR classifier. In this way, we can optimize the classifier to give us the best output possible.
Using GridSearchCV, we can find out which parameters give us the highest accuracy.
from sklearn.model_selection import GridSearchCV
#param_grid contains a dictionary of parameters we want to evaluate that exist in LR
param_grid = [
{
'solver' : ['newton-cg','lbfgs','liblinear','sag','saga'],
'max_iter' : [100, 1000, 5000]
}
]
LR_clf = GridSearchCV(LR, param_grid = param_grid, cv=3, scoring='accuracy')
LR_clf.fit(train_X[LR_best_cols], train_y)
LR_clf.best_params_
{'max_iter': 100, 'solver': 'newton-cg'}
Now that we know which parameters can give us higher accuracy, we can now fit and test our data using our classifier.
#use the parameters found by GridSearchCV
LR_best = LogisticRegression(max_iter = 100, solver = 'newton-cg')
LR_fit = LR_best.fit(train_X[LR_best_cols], train_y)
LR_best.score(test_X[LR_best_cols],test_y)
0.9805825242718447
With all parameters optimized for highest accuracy, our machine learning model was able to predict the test set with a 98% accuracy!
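Accuracy alone can hide class-specific weaknesses, so if you also want per-species precision and recall, here is a short sketch using sklearn's classification_report (the target_names follow the alphabetical LabelEncoder mapping from earlier):
#sketch: per-species precision/recall/F1 for the fitted Logistic Regression model
from sklearn.metrics import classification_report
pred_y = LR_best.predict(test_X[LR_best_cols])
print(classification_report(test_y, pred_y, target_names=["Adelie", "Chinstrap", "Gentoo"]))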
Support Vector Classifier ¶
Lets see the performance on our second machine learning classifier!
# best permutation score and corresponding columns that produced the score
vecml_best_perm, vecml_best_cols
(0.8827127659574469, ['Island', 'Culmen Depth (mm)', 'Culmen Length (mm)'])
Like LR, we can go through the same workflow to evaluate which parameters give us the highest accuracy.
param_grid = [
{
'kernel' : ["linear","poly","rbf","sigmoid"],
'gamma' : ['scale','auto'],
'max_iter' : [1000, 5000, 10000, -1]
}
]
clf = GridSearchCV(vecml, param_grid = param_grid, cv=5, scoring='accuracy')
clf.fit(train_X[vecml_best_cols], train_y)
clf.best_params_
{'gamma': 'scale', 'kernel': 'linear', 'max_iter': 1000}
Given these parameters:
'gamma': 'scale', 'kernel': 'linear', 'max_iter': 1000
we can now fit and test our Support Vector Classification!
vecml_best = SVC(gamma = "scale", kernel = "linear", max_iter = 1000)
veclml_bestfitted = vecml_best.fit(train_X[vecml_best_cols],train_y)
veclml_bestfitted.score(test_X[vecml_best_cols],test_y)
1.0
Using the same protocol as for the LR model, we were able to get 100% accuracy from the SVC predictive model!
Testing on Unforeseen Data ¶
A confusion matrix is an n×n matrix that lets us see how our model's predictions compare against the "real" target variable.
#implementing confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
#Logistic Regression Confusion Matrix
LR_pred_y = LR_best.predict(test_X[LR_best_cols])
#call the confusion matrix
LR_cm = confusion_matrix(test_y, LR_pred_y, labels=LR_fit.classes_)
#display the confusion matrix
LR_disp = ConfusionMatrixDisplay(confusion_matrix=LR_cm, display_labels=LR_fit.classes_)
LR_disp.plot(cmap=plt.cm.magma)
plt.title("LR")
plt.show()
#Support Vector Classification Confusion Matrix
SVC_pred_y = veclml_bestfitted.predict(test_X[vecml_best_cols])
SVC_cm = confusion_matrix(test_y, SVC_pred_y, labels=veclml_bestfitted.classes_)
SVC_disp = ConfusionMatrixDisplay(confusion_matrix=SVC_cm, display_labels=veclml_bestfitted.classes_)
SVC_disp.plot(cmap=plt.cm.magma)
plt.title("SVC")
plt.show()
Take some time to digest these matrices; you will find that it's a really nice visual way to evaluate your machine learning classifier!
In the above confusion matrices, we can see how the predictions compare to the actual data and whether the predictions were correct. Likewise, the confusion matrices enable us to see where the false positives/negatives occur for the species types. Because we have three categories, the matrix is 3x3. The diagonal from the top left corner to the bottom right represents the predictions that are correct.
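Because the diagonal holds the correct predictions, you can also pull a quick per-species recall straight out of the matrix; a minimal sketch using the LR confusion matrix from above:
#sketch: per-species recall = correct predictions / actual count of that species
per_species_recall = LR_cm.diagonal() / LR_cm.sum(axis=1)
print(dict(zip(["Adelie", "Chinstrap", "Gentoo"], per_species_recall.round(3))))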
Decision Regions ¶
Decision region plots are 2D regions that display the predicted penguin species based on our models¶
Buckle up, here comes a complicated section! In order to display our decision regions accurately, we will create a function that can take any classifier and output its decision regions. Because we are comparing three features, one of which is our qualitative measure, we split the data by island and create three decision region graphs, one per island.
def plot_regions(c,X,y, num_features):
'''
This function takes in 4 user-given parameters:
c, the ML classifier; X, a dataframe of predictor variables; y, a dataframe of target variables;
and num_features, the number of islands to plot (one subplot is drawn per island)
This function will then fit the data, graph the data, and graph
the ML classifier's decision regions on top of the data,
showing its accuracy and boundaries
'''
#fit our classifier that is fed into the function
c.fit(X,y)
#for labeling
penguin_species = ["Adelie", "Chinstrap", "Gentoo"]
islands = ["Biscoe", "Dream", "Torgersen"]
#for contourf lines & plotting
levels = [-1,0,1,2]
fig, ax = plt.subplots(1, num_features, figsize = (18, 5))
color_map = ["b", "g", "r"]
for i in range(num_features):
#island is 0,1,2 ("Biscoe", "Dream", "Torgersen")
island = (X["Island"] == i)
#x0,x1 store the depth/length of the specific island
x0 = X[X["Island"] == i]["Culmen Depth (mm)"]
x1 = X[X["Island"] == i]["Culmen Length (mm)"]
#grid x/y contains many digits of the least/greatest values of
#the species's depth/length
grid_x=np.linspace(x0.min(),x0.max(),501)
grid_y=np.linspace(x1.min(),x1.max(),501)
xx,yy=np.meshgrid(grid_x,grid_y)
#flatten out xx and yy into 1D arrays
XX=xx.ravel()
YY=yy.ravel()
#ZZ is a dummy var that mirrors the island chosen
ZZ = np.ones(XX.size)*i
#we will then use the data on the fitted classifier to predict on every single point of the grid
p=c.predict(np.c_[ZZ,XX,YY])
#reshape predict array to match grid array
p=p.reshape(xx.shape)
ax[i].contourf(xx,yy,p,levels = levels, cmap="jet",alpha=.2)
for j in range(len(penguin_species)): #loop over the three species classes
#loc stores the target variable dataframe that matches the island
#we are looking at
loc = (y[X["Island"] == i] == j)
ax[i].scatter(x0[loc], x1[loc], c=color_map[j], alpha = .75, label = penguin_species[j])
ax[i].set(xlabel = "Culmen Depth (mm)", ylabel = "Culmen Length (mm)", title = "Island: " + islands[i])
#Add legend, title to the figure
plt.legend(loc="best", bbox_to_anchor= (1.5, 0.35), fontsize = 15, markerscale = 1.25, title = "Species ID")
fig.suptitle("Decision Regions for Model: " + str(c))
plt.tight_layout()
To include more data points, we will use the entirety of our predictor and target variables to give a more descriptive decision region.
#Calling decision region creation for LR
X,y = convert(penguins)
X = X[LR_best_cols]
plot_regions(LR_best, X, y, 3)
#Calling decision region creation for SVC
X,y = convert(penguins)
X = X[vecml_best_cols]
plot_regions(vecml_best, X, y, 3)
While the decision regions look extremely similar for both the LR and SVC, we can see that there are slight differences in the way that LR predicts data and how the SVC predicts data. Mostly, there is a notable difference in the decision regions for Island: Biscoe and Island: Dream.
LR contains all three penguin predictions, as noted by the red, green, and blue regions in the background of the scatter. SVC does not have this pattern, showing only two colors. Nevertheless, we can see that both classifiers have linear decision boundaries, unlike other classifiers such as neural networks (MLPs) and random forest classifiers.
Additionally, we can see that on the Torgersen island, both models have their general line in the same location, but a slight difference makes it such that the LR classifier would have included an extra Adelie penguin in the Chinstrap region.
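Since plot_regions accepts any classifier, you can see what a non-linear boundary looks like by passing in a different model. Here is a small sketch using the DecisionTreeClassifier from the tree module we imported at the start (decision trees produce axis-aligned, rectangular regions):
#sketch: decision regions for a decision tree, whose boundaries are not single straight lines
X, y = convert(penguins)
X = X[LR_best_cols]
plot_regions(tree.DecisionTreeClassifier(), X, y, 3)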
Pat yourself on the back! That was not a simple task. Let's wrap up.
Discussion & Wrap Up ¶
Let's recap.
The accuracies of both the Logistic Regression model and the Support Vector Classification are above 95% after parameter optimization. This means the models are fairly accurate in predicting the species of the penguins based on the optimized features fit to the machine learning models.
We can visualize this accuracy in the confusion matrices that we built, where both models made similar prediction mistakes. Nevertheless, the mistakes were very minor in the grand scheme of things.
Even though the accuracy is high, we must still consider that a classifier that is too accurate has the potential to be overfitted. In the case of new, unforeseen data, the classifiers we created may not be flexible enough and may inaccurately predict the species of the samples more often than a less overfit classifier would.
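A quick, rough check for overfitting is to compare training accuracy against test accuracy: a large gap suggests the model is memorizing the training set. Here is a minimal sketch that refits fresh copies of both models on the training split only (the originals were refit on the full dataset inside plot_regions); LR_check and SVC_check are just illustrative names.
#sketch: a large train/test accuracy gap would suggest overfitting
LR_check = LogisticRegression(max_iter=100, solver='newton-cg').fit(train_X[LR_best_cols], train_y)
print("LR  train:", round(LR_check.score(train_X[LR_best_cols], train_y), 3),
      "test:", round(LR_check.score(test_X[LR_best_cols], test_y), 3))
SVC_check = SVC(gamma="scale", kernel="linear", max_iter=1000).fit(train_X[vecml_best_cols], train_y)
print("SVC train:", round(SVC_check.score(train_X[vecml_best_cols], train_y), 3),
      "test:", round(SVC_check.score(test_X[vecml_best_cols], test_y), 3))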
The decision regions give us another visual representation of the possible mistakes our classifiers can make during prediction. As seen in the regions above, there are certain outliers in the data that can be mispredicted. One example is the Biscoe decision region produced by the Logistic Regression model: the decision region was not able to filter out the Chinstrap species from its predictions for this island, even though the exploratory analysis showed that no Chinstraps live there.
Likewise, the SVC model was not without mistakes: in the Dream decision region, the SVC classifier mispredicted some Chinstrap penguins to be Adelie penguins. As we have discussed before, this inflexibility may be due to the fact that our models are overfitted.
I hope that you were able to get a glimpse of how Python can be used in the real world, by real scientists, to do all sorts of work! Good job on making it to the end.