Import packages and dataset
import pandas as pd
import seaborn as sns
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()
test.head()
train.info()
test.info()
df = train.append(test)
Label encode ordinal categorical columns
#label encoder (does label encoding automatically, sklearn preprocessing)
ordered_BsmtExposure = ['No','Mn','Av','Gd']
df['BsmtExposure']=df.BsmtExposure.astype("category",ordered=True,categories=ordered_BsmtExposure).cat.codes
ordered_BsmtQual = ['Po','Fa','Ta','Gd','Ex']
df['BsmtQual']=df.BsmtQual.astype("category",ordered=True,categories=ordered_BsmtQual).cat.codes
ordered_Fence = ['MnWw', 'GdWo' ,'MnPrv','GdPrv']
df['Fence']=df.Fence.astype("category",ordered=True,categories=ordered_Fence).cat.codes
ordered_ExterCond = ['Po','Fa','Ta','Gd','Ex']
df['ExterCond']=df.ExterCond.astype("category",ordered=True,categories=ordered_ExterCond).cat.codes
ordered_ExterQual = ['Po','Fa','Ta','Gd','Ex']
df['ExterQual']=df.ExterQual.astype("category",ordered=True,categories=ordered_ExterQual).cat.codes
ordered_FireplaceQu = ['Po','Fa','Ta','Gd','Ex']
df['FireplaceQu']=df.FireplaceQu.astype("category",ordered=True,categories=ordered_FireplaceQu).cat.codes
ordered_Functional = ['Sev','Maj2','Maj1','Mod','Min2','Min1','Typ']
df['Functional']=df.Functional.astype("category",ordered=True,categories=ordered_Functional).cat.codes
ordered_GarageQual = ['Po','Fa','Ta','Gd','Ex']
df['GarageQual']=df.GarageQual.astype("category",ordered=True,categories=ordered_GarageQual).cat.codes
ordered_HeatingQC = ['Po','Fa','Ta','Gd','Ex']
df['HeatingQC']=df.HeatingQC.astype("category",ordered=True,categories=ordered_HeatingQC).cat.codes
ordered_KitchenQual = ['Po','Fa','Ta','Gd','Ex']
df['KitchenQual']=df.KitchenQual.astype("category",ordered=True,categories=ordered_KitchenQual).cat.codes
ordered_GarageCond = ['Po','Fa','Ta','Gd','Ex']
df['GarageCond']=df.GarageCond.astype("category",ordered=True,categories=ordered_GarageCond).cat.codes
ordered_LandSlope = ['Gtl','Mod','Sev']
df['LandSlope']=df.LandSlope.astype("category",ordered=True,categories=ordered_LandSlope).cat.codes
ordered_LotShape = ['IR3','IR2','IR1','Reg']
df['LotShape']=df.LotShape.astype("category",ordered=True,categories=ordered_LotShape).cat.codes
ordered_PoolQC = ['Fa','Ta','Gd','Ex']
df['PoolQC']=df.PoolQC.astype("category",ordered=True,categories=ordered_PoolQC).cat.codes
ordered_GarageFinish = ['Fin','RFn','Unf']
df['GarageFinish']=df.GarageFinish.astype("category",ordered=True,categories=ordered_GarageFinish).cat.codes
ordered_BsmtCond = ['Po','Fa','Ta','Gd','Ex']
df['BsmtCond']=df.BsmtCond.astype("category",ordered=True,categories=ordered_BsmtCond).cat.codes
df.Utilities.head()
ordered_Utilities = ['AllPub','NoSewr','NoSeWa','ELO']
df['Utilities']=df.Utilities.astype("category",ordered=True,categories=ordered_Utilities).cat.codes
Add in the attributes generated through feature engineering into the dataset
df['mult']=df['GarageCars']*df['GrLivArea']*df['OverallQual']
df['TotalBath']=df['BsmtFullBath']+df['BsmtHalfBath']+df['FullBath']+df['HalfBath']
df['add']=df['FullBath']+df['Fireplaces']+df['GarageCars']+df['OverallQual']
df['TotalSF']=df['TotalBsmtSF']+df['OpenPorchSF']+df['1stFlrSF']+df['2ndFlrSF']+df['MasVnrArea']+df['WoodDeckSF']+df['GarageArea']+df['3SsnPorch']+df['PoolArea']+df['ScreenPorch']-df['LowQualFinSF']
Fill in missing values or similar values with -1,'null', or 'None'
df.YearRemodAdd[df['YearRemodAdd']==df['YearBuilt']]=-1
df['Alley'].fillna('None', inplace = True)
df.BsmtFinSF1.fillna(-1,inplace = True)
df.BsmtFinSF2.fillna(-1,inplace = True)
df.MasVnrArea.fillna(-1,inplace = True)
df.MasVnrType.fillna('Null',inplace = True)
df.MiscFeature.fillna('None',inplace = True)
df.SaleType.fillna('Null',inplace = True)
df.TotalBsmtSF.fillna(-1,inplace = True)
df['mult'].fillna(-1,inplace = True)
df['add'].fillna(-1,inplace = True)
df.TotalBath.fillna(-1,inplace = True)
df.TotalSF.fillna(-1,inplace = True)
df.Exterior1st.fillna('null', inplace = True)
df.Exterior2nd.fillna('null', inplace = True)
df.Electrical.fillna('null', inplace = True)
df.BsmtUnfSF.fillna(-1,inplace = True)
df.BsmtFullBath.fillna(-1,inplace = True)
df.BsmtHalfBath.fillna(-1,inplace = True)
df.BsmtFinType1.fillna('none',inplace = True)
df.BsmtFinType2.fillna('none',inplace = True)
df.GarageArea.fillna(-1,inplace = True)
df.GarageCars.fillna(-1,inplace = True)
df.GarageType.fillna('null',inplace = True)
df.GarageYrBlt.fillna(-1,inplace = True)
df.LotFrontage.fillna(-1,inplace = True)
df.MSZoning.fillna('null',inplace = True)
Export the cleaned dataset into a csv file
df.to_csv('cleaned2.csv')