# Source: Data-Science-and-ML-for-Structured-Data-Classification/data_preprocess.py at main · jabhinav/Data-Science-and-ML-for-Structured-Data-Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from feature_engine import discretisers as dsc
import pandas as pd


class data_preprocess:
    """Prepare a tabular dataset for classification.

    The constructor splits the feature columns of ``orig_data`` into
    categorical, numerical, discrete and continuous groups. The methods then
    drop empty rows/columns, impute missing values, encode categorical
    features and the label, min-max scale the features and optionally
    discretise them.

    Attributes set by ``__init__``:
        drop_columns: list of column names removed by ``process_data``.
        label_col: name of the label column (defaults to the last column).
        orig_df: the data frame passed in, kept unmodified.
        categorical_encoding: stored as given; not read by the methods below.
        categorical / numerical / discrete / continuous: lists of feature
            column names (label and dropped columns excluded).

    Attributes set by ``get_df_X_y``:
        y: single-column data frame holding the integer-encoded label.
        processed_df: data frame of imputed/encoded (unscaled) features.
        category_mappings: per-column encoding used for factorised features.
        label_mapping: ``{code: original label}`` for the label column.
    """

    # Numerical columns with fewer distinct values than this count as discrete.
    DISCRETE_MAX_UNIQUE = 20

    def __init__(self, orig_data, label_col=None,
                 categorical_encoding='nominal', drop_columns=None):
        """Record the configuration and classify the feature columns.

        Args:
            orig_data: pandas DataFrame holding features and the label.
            label_col: label column name; the last column when ``None``.
            categorical_encoding: stored on the instance as-is.
            drop_columns: column name or list of column names to ignore.
        """
        # A single column name is accepted as a convenience.
        if isinstance(drop_columns, str):
            drop_columns = [drop_columns]
        self.drop_columns = list(drop_columns) if drop_columns else []
        if label_col is None:
            label_col = orig_data.columns[-1]
        self.label_col = label_col
        self.orig_df = orig_data
        self.categorical_encoding = categorical_encoding
        self.y = None
        self.processed_df = None
        self.category_mappings = None
        self.label_mapping = None

        # Use the normalised list here: testing membership against a raw
        # ``None`` argument would raise TypeError.
        excluded = set(self.drop_columns)
        excluded.add(self.label_col)
        feature_cols = [var for var in orig_data.columns if var not in excluded]

        # Categorical features: object or string typed columns.
        self.categorical = [
            var for var in feature_cols if self._is_categorical(orig_data[var])
        ]
        # Every remaining feature is treated as numerical.
        self.numerical = [
            var for var in feature_cols if not self._is_categorical(orig_data[var])
        ]
        # Numerical features with few distinct values (NaN counts as a value).
        self.discrete = [
            var for var in self.numerical
            if orig_data[var].nunique(dropna=False) < self.DISCRETE_MAX_UNIQUE
        ]
        # The rest of the numerical features are continuous.
        self.continuous = [var for var in self.numerical if var not in self.discrete]

    @staticmethod
    def _is_categorical(series):
        """Return True when ``series`` has an object or string dtype."""
        dtype = series.dtype
        return bool(pd.api.types.is_object_dtype(dtype)
                    or pd.api.types.is_string_dtype(dtype))

    @staticmethod
    def _drop_na(df, label_col):
        """Drop unlabeled rows, then fully-empty rows and fully-empty columns.

        Args:
            df: pandas DataFrame to filter.
            label_col: name of the label column.

        Returns:
            A new DataFrame; ``df`` itself is not modified.
        """
        # Rows without a label cannot be used.
        df = df[df[label_col].notna()]
        # ``how`` and ``thresh`` are mutually exclusive in recent pandas, so
        # only ``how`` is passed.
        df = df.dropna(axis=0, how="all")
        df = df.dropna(axis=1, how="all")
        return df

    def get_df_X_y(self, df, label_column, onehot_encode=None, column_label_dict=None):
        """Impute, encode and scale ``df`` into a feature matrix and labels.

        Args:
            df: pandas DataFrame containing the features and ``label_column``.
            label_column: name of the label column in ``df``.
            onehot_encode: categorical column names to one-hot encode; names
                that are not categorical features are ignored.
            column_label_dict: optional ``{column: {value: code}}`` mapping
                used instead of automatic factorisation for that column.

        Returns:
            ``(X, y)`` where ``X`` is the min-max scaled feature array and
            ``y`` is the 1-D array of integer label codes.

        Side effects:
            Sets ``self.y``, ``self.processed_df``, ``self.category_mappings``
            and ``self.label_mapping``.
        """
        requested_onehot = set(onehot_encode) if onehot_encode else set()
        column_label_dict = column_label_dict if column_label_dict else {}

        # Work on copies so the caller's frame is never written to.
        df_Y = df[[label_column]].copy()
        df_X = df.loc[:, df.columns != label_column].copy()

        # Columns classified in __init__ may have been dropped since (for
        # example all-NaN columns removed by process_data); skip those.
        char_cols = [c for c in self.categorical if c in df_X.columns]
        num_cols = [c for c in self.numerical if c in df_X.columns]

        # Missing categorical values become their own "" category.
        if char_cols:
            df_X[char_cols] = df_X[char_cols].fillna("")
        # Missing numerical values are replaced by the column median.
        for col in num_cols:
            df_X[col] = df_X[col].fillna(df_X[col].median())

        # One-hot encode the requested categorical columns. Keeping the
        # char_cols order makes the resulting column order deterministic.
        # NOTE: the prefix is the first four characters of the column name, so
        # columns sharing those characters produce clashing dummy names.
        onehot_cols = [c for c in char_cols if c in requested_onehot]
        if onehot_cols:
            df_X = pd.get_dummies(df_X, prefix=[col[:4] for col in onehot_cols],
                                  columns=onehot_cols, prefix_sep='_', drop_first=False)

        # Integer-encode the remaining categorical columns.
        category_mappings = {}
        for c in char_cols:
            if c in onehot_cols:
                continue
            if c in column_label_dict:
                # Caller-supplied (e.g. ordinal) encoding. Assign the result
                # back: an in-place replace on a column selection may act on a
                # temporary copy and be lost.
                df_X[c] = df_X[c].replace(column_label_dict[c])
                category_mappings[c] = column_label_dict[c]
            else:
                df_X[c], category_mappings[c] = pd.factorize(df_X[c], sort=True)

        # Integer-encode the label in order of first appearance.
        label_codes, label_values = pd.factorize(df_Y[label_column])
        df_Y[label_column] = label_codes

        self.category_mappings = category_mappings
        self.label_mapping = dict(enumerate(label_values))

        X = df_X.values
        y = df_Y.values.T[0]

        # Min-max normalisation of every feature column.
        min_max_scaler = preprocessing.MinMaxScaler()
        X = min_max_scaler.fit_transform(X)

        self.y = df_Y
        self.processed_df = df_X
        return X, y

    def process_data(self):
        """Return ``orig_df`` without the dropped columns and empty rows/columns.

        Returns:
            A new DataFrame; ``self.orig_df`` is left untouched.
        """
        df = self.orig_df
        if self.drop_columns:
            df = df.drop(self.drop_columns, axis=1)
        # Remove unlabeled rows plus fully-empty rows and columns.
        return self._drop_na(df, self.label_col)

    @staticmethod
    def discretise_X(X, n_bins):
        """Bin every feature of ``X`` into ``n_bins`` ordinal bins (k-means).

        Args:
            X: feature array; one bin count is used per entry of its last axis.
            n_bins: number of bins for each feature.

        Returns:
            The array of ordinal bin identifiers produced by KBinsDiscretizer.
        """
        bins = [n_bins] * X.shape[-1]
        est = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='kmeans')
        return est.fit(X).transform(X)

    def discretise_using_decision_trees(self):
        """Discretise the continuous features with decision trees.

        Uses the features and labels stored by ``get_df_X_y``.

        Returns:
            The values of the transformed feature frame.

        Raises:
            RuntimeError: if ``get_df_X_y`` has not been called yet.
        """
        if self.processed_df is None or self.y is None:
            raise RuntimeError(
                "get_df_X_y must be called before discretise_using_decision_trees")
        # Set up the discretisation transformer.
        disc = dsc.DecisionTreeDiscretiser(
            cv=3,
            scoring='neg_mean_squared_error',  # Since, we are discretizing cont variables, use MSE to create splits
            variables=self.continuous,
            param_grid={'max_depth': [1, 2, 3, 4, 5]},
            regression=False)
        # Fit the transformer on the processed features and encoded label.
        disc.fit(self.processed_df, self.y)
        # Transform and return the data.
        discretized_X = disc.transform(self.processed_df)
        return discretized_X.values


"""
For categorical variables, treating NaN(s) as an additional value of the variable is a reasonable approach.
For numerical variables, we use missing-value imputation.

Note: Numerical variables that take discrete values exhibit an ordinal relationship when the variable is treated as
categorical. The discrete numerical values of a feature are inherently ordered and thus need no encoding.
"""