Import packages and dataset

In [2]:
import pandas as pd
import seaborn as sns
In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [4]:
train.head()
Out[4]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

In [5]:
test.head()
Out[5]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1461 20 RH 80.0 11622 Pave NaN Reg Lvl AllPub ... 120 0 NaN MnPrv NaN 0 6 2010 WD Normal
1 1462 20 RL 81.0 14267 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN Gar2 12500 6 2010 WD Normal
2 1463 60 RL 74.0 13830 Pave NaN IR1 Lvl AllPub ... 0 0 NaN MnPrv NaN 0 3 2010 WD Normal
3 1464 60 RL 78.0 9978 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 6 2010 WD Normal
4 1465 120 RL 43.0 5005 Pave NaN IR1 HLS AllPub ... 144 0 NaN NaN NaN 0 1 2010 WD Normal

5 rows × 80 columns

In [6]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1452 non-null object
MasVnrArea       1452 non-null float64
ExterQual        1460 non-null object
ExterCond        1460 non-null object
Foundation       1460 non-null object
BsmtQual         1423 non-null object
BsmtCond         1423 non-null object
BsmtExposure     1422 non-null object
BsmtFinType1     1423 non-null object
BsmtFinSF1       1460 non-null int64
BsmtFinType2     1422 non-null object
BsmtFinSF2       1460 non-null int64
BsmtUnfSF        1460 non-null int64
TotalBsmtSF      1460 non-null int64
Heating          1460 non-null object
HeatingQC        1460 non-null object
CentralAir       1460 non-null object
Electrical       1459 non-null object
1stFlrSF         1460 non-null int64
2ndFlrSF         1460 non-null int64
LowQualFinSF     1460 non-null int64
GrLivArea        1460 non-null int64
BsmtFullBath     1460 non-null int64
BsmtHalfBath     1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
BedroomAbvGr     1460 non-null int64
KitchenAbvGr     1460 non-null int64
KitchenQual      1460 non-null object
TotRmsAbvGrd     1460 non-null int64
Functional       1460 non-null object
Fireplaces       1460 non-null int64
FireplaceQu      770 non-null object
GarageType       1379 non-null object
GarageYrBlt      1379 non-null float64
GarageFinish     1379 non-null object
GarageCars       1460 non-null int64
GarageArea       1460 non-null int64
GarageQual       1379 non-null object
GarageCond       1379 non-null object
PavedDrive       1460 non-null object
WoodDeckSF       1460 non-null int64
OpenPorchSF      1460 non-null int64
EnclosedPorch    1460 non-null int64
3SsnPorch        1460 non-null int64
ScreenPorch      1460 non-null int64
PoolArea         1460 non-null int64
PoolQC           7 non-null object
Fence            281 non-null object
MiscFeature      54 non-null object
MiscVal          1460 non-null int64
MoSold           1460 non-null int64
YrSold           1460 non-null int64
SaleType         1460 non-null object
SaleCondition    1460 non-null object
SalePrice        1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
In [7]:
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-null object
Exterior2nd      1458 non-null object
MasVnrType       1443 non-null object
MasVnrArea       1444 non-null float64
ExterQual        1459 non-null object
ExterCond        1459 non-null object
Foundation       1459 non-null object
BsmtQual         1415 non-null object
BsmtCond         1414 non-null object
BsmtExposure     1415 non-null object
BsmtFinType1     1417 non-null object
BsmtFinSF1       1458 non-null float64
BsmtFinType2     1417 non-null object
BsmtFinSF2       1458 non-null float64
BsmtUnfSF        1458 non-null float64
TotalBsmtSF      1458 non-null float64
Heating          1459 non-null object
HeatingQC        1459 non-null object
CentralAir       1459 non-null object
Electrical       1459 non-null object
1stFlrSF         1459 non-null int64
2ndFlrSF         1459 non-null int64
LowQualFinSF     1459 non-null int64
GrLivArea        1459 non-null int64
BsmtFullBath     1457 non-null float64
BsmtHalfBath     1457 non-null float64
FullBath         1459 non-null int64
HalfBath         1459 non-null int64
BedroomAbvGr     1459 non-null int64
KitchenAbvGr     1459 non-null int64
KitchenQual      1458 non-null object
TotRmsAbvGrd     1459 non-null int64
Functional       1457 non-null object
Fireplaces       1459 non-null int64
FireplaceQu      729 non-null object
GarageType       1383 non-null object
GarageYrBlt      1381 non-null float64
GarageFinish     1381 non-null object
GarageCars       1458 non-null float64
GarageArea       1458 non-null float64
GarageQual       1381 non-null object
GarageCond       1381 non-null object
PavedDrive       1459 non-null object
WoodDeckSF       1459 non-null int64
OpenPorchSF      1459 non-null int64
EnclosedPorch    1459 non-null int64
3SsnPorch        1459 non-null int64
ScreenPorch      1459 non-null int64
PoolArea         1459 non-null int64
PoolQC           3 non-null object
Fence            290 non-null object
MiscFeature      51 non-null object
MiscVal          1459 non-null int64
MoSold           1459 non-null int64
YrSold           1459 non-null int64
SaleType         1458 non-null object
SaleCondition    1459 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 912.0+ KB
In [8]:
df = train.append(test)
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\pandas\core\frame.py:6201: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.

To retain the current behavior and silence the warning, pass sort=False

  sort=sort)

Label encode ordinal categorical columns

In [21]:
#label encoder (does label encoding automatically, sklearn preprocessing)
ordered_BsmtExposure = ['No','Mn','Av','Gd']
df['BsmtExposure']=df.BsmtExposure.astype("category",ordered=True,categories=ordered_BsmtExposure).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [24]:
ordered_BsmtQual = ['Po','Fa','Ta','Gd','Ex']
df['BsmtQual']=df.BsmtQual.astype("category",ordered=True,categories=ordered_BsmtQual).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [25]:
ordered_Fence = ['MnWw', 'GdWo' ,'MnPrv','GdPrv']
df['Fence']=df.Fence.astype("category",ordered=True,categories=ordered_Fence).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [26]:
ordered_ExterCond = ['Po','Fa','Ta','Gd','Ex']
df['ExterCond']=df.ExterCond.astype("category",ordered=True,categories=ordered_ExterCond).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [27]:
ordered_ExterQual = ['Po','Fa','Ta','Gd','Ex']
df['ExterQual']=df.ExterQual.astype("category",ordered=True,categories=ordered_ExterQual).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [29]:
ordered_FireplaceQu = ['Po','Fa','Ta','Gd','Ex']
df['FireplaceQu']=df.FireplaceQu.astype("category",ordered=True,categories=ordered_FireplaceQu).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [30]:
ordered_Functional = ['Sev','Maj2','Maj1','Mod','Min2','Min1','Typ']
df['Functional']=df.Functional.astype("category",ordered=True,categories=ordered_Functional).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [31]:
ordered_GarageQual = ['Po','Fa','Ta','Gd','Ex']
df['GarageQual']=df.GarageQual.astype("category",ordered=True,categories=ordered_GarageQual).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [32]:
ordered_HeatingQC = ['Po','Fa','Ta','Gd','Ex']
df['HeatingQC']=df.HeatingQC.astype("category",ordered=True,categories=ordered_HeatingQC).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [33]:
ordered_KitchenQual = ['Po','Fa','Ta','Gd','Ex']
df['KitchenQual']=df.KitchenQual.astype("category",ordered=True,categories=ordered_KitchenQual).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [93]:
ordered_GarageCond = ['Po','Fa','Ta','Gd','Ex']
df['GarageCond']=df.GarageCond.astype("category",ordered=True,categories=ordered_GarageCond).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [34]:
ordered_LandSlope = ['Gtl','Mod','Sev']
df['LandSlope']=df.LandSlope.astype("category",ordered=True,categories=ordered_LandSlope).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [35]:
ordered_LotShape = ['IR3','IR2','IR1','Reg']
df['LotShape']=df.LotShape.astype("category",ordered=True,categories=ordered_LotShape).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [37]:
ordered_PoolQC = ['Fa','Ta','Gd','Ex']
df['PoolQC']=df.PoolQC.astype("category",ordered=True,categories=ordered_PoolQC).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [94]:
ordered_GarageFinish = ['Fin','RFn','Unf']
df['GarageFinish']=df.GarageFinish.astype("category",ordered=True,categories=ordered_GarageFinish).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [68]:
ordered_BsmtCond = ['Po','Fa','Ta','Gd','Ex']
df['BsmtCond']=df.BsmtCond.astype("category",ordered=True,categories=ordered_BsmtCond).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  
In [51]:
df.Utilities.head()
Out[51]:
0    AllPub
1    AllPub
2    AllPub
3    AllPub
4    AllPub
Name: Utilities, dtype: object
In [52]:
ordered_Utilities = ['AllPub','NoSewr','NoSeWa','ELO']
df['Utilities']=df.Utilities.astype("category",ordered=True,categories=ordered_Utilities).cat.codes
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  

Add in the attributes generated through feature engineering into the dataset

In [54]:
df['mult']=df['GarageCars']*df['GrLivArea']*df['OverallQual']
In [63]:
df['TotalBath']=df['BsmtFullBath']+df['BsmtHalfBath']+df['FullBath']+df['HalfBath']
In [55]:
df['add']=df['FullBath']+df['Fireplaces']+df['GarageCars']+df['OverallQual']
In [64]:
df['TotalSF']=df['TotalBsmtSF']+df['OpenPorchSF']+df['1stFlrSF']+df['2ndFlrSF']+df['MasVnrArea']+df['WoodDeckSF']+df['GarageArea']+df['3SsnPorch']+df['PoolArea']+df['ScreenPorch']-df['LowQualFinSF']

Fill in missing values or similar values with -1,'null', or 'None'

In [62]:
df.YearRemodAdd[df['YearRemodAdd']==df['YearBuilt']]=-1
C:\User_Files\Lucy_Wan\Programming\Anaconda2\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
In [67]:
df['Alley'].fillna('None', inplace = True)
In [69]:
df.BsmtFinSF1.fillna(-1,inplace = True)
In [70]:
df.BsmtFinSF2.fillna(-1,inplace = True)
In [71]:
df.MasVnrArea.fillna(-1,inplace = True)
In [72]:
df.MasVnrType.fillna('Null',inplace = True)
In [73]:
df.MiscFeature.fillna('None',inplace = True)
In [74]:
df.SaleType.fillna('Null',inplace = True)
In [75]:
df.TotalBsmtSF.fillna(-1,inplace = True)
In [78]:
df['mult'].fillna(-1,inplace = True)
df['add'].fillna(-1,inplace = True)
df.TotalBath.fillna(-1,inplace = True)
df.TotalSF.fillna(-1,inplace = True)
In [80]:
df.Exterior1st.fillna('null', inplace = True)
In [81]:
df.Exterior2nd.fillna('null', inplace = True)
In [82]:
df.Electrical.fillna('null', inplace = True)
In [83]:
df.BsmtUnfSF.fillna(-1,inplace = True)
In [84]:
df.BsmtFullBath.fillna(-1,inplace = True)
In [85]:
df.BsmtHalfBath.fillna(-1,inplace = True)
In [86]:
df.BsmtFinType1.fillna('none',inplace = True)
In [87]:
df.BsmtFinType2.fillna('none',inplace = True)
In [91]:
df.GarageArea.fillna(-1,inplace = True)
In [92]:
df.GarageCars.fillna(-1,inplace = True)
In [95]:
df.GarageType.fillna('null',inplace = True)
In [96]:
df.GarageYrBlt.fillna(-1,inplace = True)
In [97]:
df.LotFrontage.fillna(-1,inplace = True)
In [98]:
df.MSZoning.fillna('null',inplace = True)

Export the cleaned dataset into a csv file

In [99]:
df.to_csv('cleaned2.csv')