Data Preprocessing in Python




Problem statement: Using the independent variables, predict whether or not an individual is a smoker.


Download The Dataset

Download The Code File


Variables:

Independent Variables : age, sex, bmi, children, expenses

Dependent Variable : smoker


#Importing the libraries

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd


#Change to the directory that contains the dataset and set it as the current console's working directory

#Importing the Dataset

# Read the raw table from the CSV file in the current working directory.
dataset = pd.read_csv('insurance.csv')

# Features: every column except the last one. Going through the raw
# array and back into a DataFrame drops the original column names, so
# the columns are addressed by position (0, 1, 2, ...) from here on.
X = pd.DataFrame(dataset.iloc[:, :-1].values)

# Target: the column at position 5, wrapped as a single-column DataFrame.
y = pd.DataFrame(dataset.iloc[:, 5].values)



#Taking care of Missing Data

import sklearn

# Imputer was removed from sklearn.preprocessing in scikit-learn 0.22;
# SimpleImputer from sklearn.impute is its replacement.
from sklearn.impute import SimpleImputer

# SimpleImputer identifies missing entries by the np.nan value itself,
# not by the string 'NaN' that the old Imputer accepted.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Replace missing entries in columns 2 onward with the column mean.
# fit_transform leaves `imputer` fitted, so it can be reused later.
# The result is written back through .iloc so the DataFrame itself is
# updated; assigning into X.values only works when pandas happens to
# hand back a writable view of the data, which is not guaranteed.
X.iloc[:, 2:] = imputer.fit_transform(X.iloc[:, 2:].to_numpy(dtype=float))


#Encoding Categorical Data

#Encoding the Independent Variable

from sklearn.preprocessing import LabelEncoder

labelencoder_X = LabelEncoder()

# Replace the labels in column 1 with integer codes. The assignment goes
# through .iloc so the DataFrame is updated directly instead of relying
# on X.values being a writable view of the underlying data.
X.iloc[:, 1] = labelencoder_X.fit_transform(X.iloc[:, 1])



#Encoding the Dependent Variable

labelencoder_y = LabelEncoder()

# LabelEncoder expects a 1-D sequence of labels. Flattening y first
# avoids the column-vector warning raised for a single-column DataFrame;
# y becomes a 1-D NumPy array of integer codes.
y = labelencoder_y.fit_transform(np.ravel(y))



#Splitting the dataset into the Training set and Test set

# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. No random_state is passed,
# so the rows chosen for each set differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)