Dealing with Missing Data

Why do we need to convert missing records to NaN (Not a Number)?

A newly collected data set is often messy: it may contain missing values (empty records), or numeric entries marked with placeholder symbols (such as '-' or 'M') indicating null values. The Pandas package is handy in this case because it can automatically convert missing values to "Not A Number" (NaN). Users can then keep working with the other important features without omitting a data entry just because one of its features has a missing value.
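As a minimal sketch of this behavior, consider the tiny invented CSV below: read_csv() fills the empty cell with NaN automatically, and aggregations such as mean() skip NaN records by default.

```python
import io
import pandas as pd

# Hypothetical two-row CSV with an empty Tavg cell
csv_text = "Date,Tavg\n2007-05-01,67\n2007-05-02,\n"
df = pd.read_csv(io.StringIO(csv_text))

print(df["Tavg"].isna().sum())   # 1: the empty cell became NaN
print(df["Tavg"].mean())         # 67.0: mean ignores the NaN record
```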

For example, let us refer to the weather.csv data available for the Kaggle competition "Predict West Nile virus in mosquitos across the city of Chicago". The weather data describes weather conditions that may affect the presence of West Nile virus. If we inspect this data set with the summarize() method of a DataIn instance, only 6 features are recognized as numeric data, as shown below.

In [1]:
# weather.csv file downloaded from https://www.kaggle.com/c/predict-west-nile-virus/data

%matplotlib inline
import sys
sys.path.append("../bin/")
from data import DataIn
import pandas as pd
from preprocess import summary

if __name__ == "__main__":
    weather = DataIn("weather.csv")
    weather.summarize()                                                              
No missing values in the columns of weather.csv!

--------------------------------------------------------------------------------
********************    Begin of the summary of text data   ********************
--------------------------------------------------------------------------------
count           2944
unique          1472
top       2011-08-18
freq               2
Name: Date, dtype: object


count     2944
unique      60
top         73
freq       138
Name: Tavg, dtype: object


count     2944
unique      42
top          M
freq      1472
Name: Depart, dtype: object


count     2944
unique      48
top         63
freq       135
Name: WetBulb, dtype: object


count     2944
unique      31
top          0
freq      1870
Name: Heat, dtype: object


count     2944
unique      31
top          0
freq      1147
Name: Cool, dtype: object


count     2944
unique     122
top          -
freq      1472
Name: Sunrise, dtype: object


count     2944
unique     119
top          -
freq      1472
Name: Sunset, dtype: object


count     2944
unique      98
top           
freq      1609
Name: CodeSum, dtype: object


count     2944
unique       2
top          0
freq      1472
Name: Depth, dtype: object


count     2944
unique       1
top          M
freq      2944
Name: Water1, dtype: object


count     2944
unique       4
top          M
freq      1472
Name: SnowFall, dtype: object


count     2944
unique     168
top       0.00
freq      1577
Name: PrecipTotal, dtype: object


count      2944
unique      104
top       29.34
freq        128
Name: StnPressure, dtype: object


count      2944
unique      102
top       30.00
freq         96
Name: SeaLevel, dtype: object


count     2944
unique     178
top        6.9
freq        63
Name: AvgSpeed, dtype: object


--------------------------------------------------------------------------------
********************    End of the summary of text data     ********************
--------------------------------------------------------------------------------

On closer inspection of the raw data, we notice that the feature "Depart" has many missing entries marked as M. Likewise, the "Sunrise" and "Sunset" features have missing entries marked as -. By default, when we import the .csv file into a Pandas DataFrame, a feature containing both text (such as the 'M' missing marker) and numbers is treated as the object data type and processed as text data.
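A minimal illustration of this dtype behavior, with column names borrowed from weather.csv and invented data: a single 'M' entry is enough to force the whole column to object, while a fully numeric column stays numeric.

```python
import io
import pandas as pd

# A column mixing numbers with the 'M' missing marker is parsed as object (text)
csv_text = "Depart,Heat\n14,0\nM,5\n-3,0\n"
df = pd.read_csv(io.StringIO(csv_text))

print(df["Depart"].dtype)  # object: the 'M' entry forces text treatment
print(df["Heat"].dtype)    # int64: a fully numeric column stays numeric
```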

Pandas's to_numeric(errors='coerce') function becomes handy in this case: it converts the numeric entries of a column to a numeric data type while marking the non-numeric entries of the same column as NaN. By converting these entries to NaN, we can continue to perform mathematical operations on these features, since Pandas ignores the NaN records. One can then also use the dropna() method of DataFrame to remove any row entry in which a null value (NaN) is present. For this purpose, I designed a helper function, numeric(), to achieve this.

    def numeric(self, clean=True):
        """Convert the features with partial numeric records to numeric type.

        Parameters
        ----------
        clean : boolean, default True
            If True, drop all row entries where a NaN is found.
            If False, keep the row entries with NaN values.
        """
        colnames = self.df.columns
        # Coerce every column: non-numeric entries become NaN
        temp = self.df.apply(pd.to_numeric, errors='coerce')
        removed = []
        for x in colnames:
            # Columns that became all-NaN are genuinely text (object) columns
            if temp[x].isnull().values.all():
                removed.append(x)

        colnames = colnames.drop(removed)
        # Replace the partially numeric columns with their coerced values
        self.df[colnames] = temp[colnames]

        # Remove all row entries in which a NaN is present
        if clean:
            self.df = self.df.dropna()

The main idea of this helper function is to use pd.to_numeric(errors='coerce') for the conversion. If all of a feature's values become NaN after the conversion, the feature is genuinely of object data type and numeric conversion is not suitable; numeric() therefore only converts features whose data are partially numeric. The following demonstrates the usage of numeric() on weather (the DataIn instance): the number of data entries drops from 2944 to 1294, and the features "Date", "CodeSum", and "Water1" are correctly retained as object type.

There is a risk in discarding data entries that have only a few NaN values, which is not optimal for analysis. For instance, in the following chart, the "Station" of the remaining records belongs to "Station=1" only, whereas the complete data contain both "Station=1" and "Station=2". In fact, the "Station" feature is better treated as a categorical feature. Nonetheless, we can now see what proportion of our data has complete records without missing values, as well as the distribution of the categorical features with their labels encoded.

If we instead want to apply the Imputer function in sklearn to complete the missing values (NaN) using strategies such as replacing them with the average value, we can simply call numeric(clean=False) to retain all the records. The example code of this post is available on my GitHub page as well.

In [2]:
# Convert the data to numeric type and remove NaN entries             
weather.numeric()
weather.summarize()
--------------------------------------------------------------------------------
********************    Begin of the summary of text data   ********************
--------------------------------------------------------------------------------
count           1294
unique          1294
top       2008-10-30
freq               1
Name: Date, dtype: object


count     1294
unique      62
top           
freq       730
Name: CodeSum, dtype: object


count     1294
unique       1
top          M
freq      1294
Name: Water1, dtype: object


--------------------------------------------------------------------------------
********************    End of the summary of text data     ********************
--------------------------------------------------------------------------------
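The mean-replacement strategy mentioned above can be sketched with pandas alone (sklearn's imputer applies the same strategy with strategy='mean'). The frame below is invented, standing in for a slice of the weather data after numeric(clean=False), where coercion left NaN where the 'M' markers used to be.

```python
import numpy as np
import pandas as pd

# Illustrative columns after numeric(clean=False): NaN marks former 'M' entries
df = pd.DataFrame({"Depart": [14.0, np.nan, -3.0, np.nan],
                   "WetBulb": [63.0, 58.0, np.nan, 61.0]})

# Mean imputation: replace each NaN with its own column's mean
filled = df.fillna(df.mean())

print(filled["Depart"].tolist())  # [14.0, 5.5, -3.0, 5.5]
```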