@@ -27,70 +27,103 @@ if not os.path.isdir(os.path.join(data_path,"raw")):
2727df.to_csv(os.path.join(data_path," raw/cal_housing.csv" ), index = False )
2828```
2929
30- Loading the data into Julia session:
30+ Loading the data into a Julia session.
3131
3232``` {julia}
3333df = CSV.read(joinpath(data_path, "raw/cal_housing.csv"), DataFrame)
34- # Features:
35- X = Matrix(df[:,Not(:target)])
36- dt = StatsBase.fit(ZScoreTransform, X, dims=1)
37- StatsBase.transform!(dt, X)
34+ # # Features:
35+ # X = Matrix(df[:,Not(:target)])
36+ # dt = StatsBase.fit(ZScoreTransform, X, dims=1)
37+ # StatsBase.transform!(dt, X)
38+ # df = DataFrame(X,:auto)
3839# Target:
3940y = df.target
40- y = Float64.(y .>= median(y)); # binary target
41+ y = Float64.(y .>= median(y)); # binary target: 1.0 if at or above the median (positive outcome), else 0.0
4142# Data:
42- df = DataFrame(X,:auto)
4343df.target = y
4444```
4545
46+ Random undersampling to balance the data:
47+
4648``` {julia}
47- using MLUtils: undersample
48- # Make DataFrames.jl work
49- MLUtils.getobs(data::DataFrame, i) = data[i,:]
50- MLUtils.numobs(data::DataFrame) = nrow(data)
5149df_balanced = getobs(undersample(df, df.target;shuffle=true))[1]
5250```
5351
52+ All features are continuous:
53+
54+ ``` {julia}
55+ schema(df_balanced)
56+ ```
57+
58+ Turning the data into ` CounterfactualData ` :
59+
5460``` {julia}
55- CSV.write(joinpath(data_path, "cal_housing.csv"), df_balanced)
61+ X = Matrix(df_balanced[:,Not(:target)]) # all columns except :target, as a matrix (one row per observation)
62+ X = permutedims(X) # swap dimensions so that each observation becomes a column
63+ y = permutedims(df_balanced.target) # target vector reshaped into a 1-by-n row matrix
64+ data = CounterfactualData(X,y) # NOTE(review): presumably expects observations along columns — confirm against CounterfactualExplanations.jl
65+ ```
66+
67+ Saving the data:
68+
69+ ``` {julia}
70+ CSV.write(joinpath(data_path, "cal_housing.csv"), df_balanced) # undersampled DataFrame (binary target), written as CSV
71+ Serialization.serialize(joinpath(data_path,"cal_housing.jls"), data) # serialized CounterfactualData object
5672```
5773
5874
5975## Give Me Some Credit
6076
77+ Loading and basic preprocessing:
78+
6179``` {julia}
6280df = CSV.read(joinpath(data_path, "raw/cs-training.csv"), DataFrame)
6381select!(df, Not([:Column1]))
6482rename!(df, :SeriousDlqin2yrs => :target)
6583mapcols!(x -> [ifelse(x_=="NA", missing, x_) for x_ in x], df)
6684dropmissing!(df)
6785mapcols!(x -> eltype(x) <: AbstractString ? parse.(Int, x) : x, df)
68- # Features:
69- X = Matrix(df[:,Not(:target)])
70- dt = StatsBase.fit(ZScoreTransform, X, dims=1)
71- StatsBase.transform!(dt, X)
86+ # # Features:
87+ # X = Matrix(df[:,Not(:target)])
88+ # dt = StatsBase.fit(ZScoreTransform, X, dims=1)
89+ # StatsBase.transform!(dt, X)
90+ # df = DataFrame(X,:auto)
7291# Target:
73- y = df.target
74- # Data:
75- df = DataFrame(X,:auto)
76- df.target = y
92+ df.target .= map(y -> y == 0 ? 1 : 0, df.target) # recode: original 0 becomes 1, anything else 0; positive outcome = no delinquency
7793```
7894
95+ Balancing:
96+
7997``` {julia}
80- using MLUtils
81- using MLUtils: undersample
82- # Make DataFrames.jl work
83- MLUtils.getobs(data::DataFrame, i) = data[i,:]
84- MLUtils.numobs(data::DataFrame) = nrow(data)
8598df_balanced = getobs(undersample(df, df.target;shuffle=true))[1]
8699```
87100
101+ All features are continuous:
102+
88103``` {julia}
89- CSV.write(joinpath(data_path, "gmsc.csv"), df_balanced)
104+ schema(df_balanced)
105+ ```
106+
107+ Turning the data into ` CounterfactualData ` :
108+
109+ ``` {julia}
110+ X = Matrix(df_balanced[:,Not(:target)]) # all columns except :target, as a matrix (one row per observation)
111+ X = permutedims(X) # swap dimensions so that each observation becomes a column
112+ y = permutedims(df_balanced.target) # target vector reshaped into a 1-by-n row matrix
113+ data = CounterfactualData(X,y) # NOTE(review): presumably expects observations along columns — confirm against CounterfactualExplanations.jl
114+ ```
115+
116+ Saving:
117+
118+ ``` {julia}
119+ CSV.write(joinpath(data_path, "gmsc.csv"), df_balanced) # undersampled DataFrame (binary target), written as CSV
120+ Serialization.serialize(joinpath(data_path,"gmsc.jls"), data) # serialized CounterfactualData object
90121```
91122
92123## UCI Credit Card Default
93124
125+ Loading and basic preprocessing:
126+
94127``` {julia}
95128df = CSV.read(joinpath(data_path, "raw/UCI_Credit_Card.csv"), DataFrame)
96129select!(df, Not([:ID]))
@@ -100,17 +133,57 @@ df.SEX = categorical(df.SEX)
100133df.EDUCATION = categorical(df.EDUCATION)
101134df.MARRIAGE = categorical(df.MARRIAGE)
102135mapcols!(x -> eltype(x) <: AbstractString ? parse.(Int, x) : x, df)
136+ df.target .= map(y -> y == 0 ? 1 : 0, df.target) # recode: original 0 becomes 1, anything else 0; positive outcome = no default
103137```
104138
139+ Balancing:
140+
105141``` {julia}
106- # Make DataFrames.jl work
107- MLUtils.getobs(data::DataFrame, i) = data[i,:]
108- MLUtils.numobs(data::DataFrame) = nrow(data)
109142df_balanced = getobs(undersample(df, df.target;shuffle=true))[1]
110143```
111144
145+ **Not** all features are continuous:
146+
147+ ``` {julia}
148+ schema(df_balanced)
149+ ```
150+
151+ One-hot encoding:
152+
153+ ``` {julia}
154+ hot = OneHotEncoder()
155+ mach = MLJBase.fit!(machine(hot, df_balanced))
156+ df_balanced = MLJBase.transform(mach, df_balanced)
157+ schema(df_balanced)
158+ ```
159+
160+ Categorical indices:
161+
162+ ``` {julia}
163+ features_categorical = [ # hard-coded column-index groups; NOTE(review): must match the column order printed by schema(df_balanced) after one-hot encoding
164+ [2,3], # presumably the one-hot columns of SEX — TODO confirm
165+ collect(4:10), # presumably the one-hot columns of EDUCATION — TODO confirm
166+ collect(11:14) # presumably the one-hot columns of MARRIAGE — TODO confirm
167+ ]
168+ ```
169+
170+ Preparing for use with ` CounterfactualExplanations.jl ` :
171+
172+ ``` {julia}
173+ X = Matrix(df_balanced[:,Not(:target)]) # all (one-hot encoded) columns except :target, as a matrix
174+ X = permutedims(X) # swap dimensions so that each observation becomes a column
175+ y = permutedims(df_balanced.target) # target vector reshaped into a 1-by-n row matrix
176+ data = CounterfactualData(
177+ X, y;
178+ features_categorical = features_categorical # index groups defined above for the encoded categorical features
179+ )
180+ ```
181+
182+ Saving:
183+
112184``` {julia}
113- CSV.write(joinpath(data_path, "credit_default.csv"), df_balanced)
185+ CSV.write(joinpath(data_path, "credit_default.csv"), df_balanced) # undersampled, one-hot encoded DataFrame, written as CSV
186+ Serialization.serialize(joinpath(data_path,"credit_default.jls"), data) # serialized CounterfactualData object
114187```
115188
116189
0 commit comments