Skip to content

Commit b9e32e3

Browse files
committed
rejigged data pre-processing
1 parent a4baa56 commit b9e32e3

1 file changed

Lines changed: 9 additions & 19 deletions

File tree

docs/src/paper/data_preprocessing/_real_world_data.qmd

Lines changed: 9 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -48,18 +48,17 @@ using MLUtils: undersample
4848
# Make DataFrames.jl work
4949
MLUtils.getobs(data::DataFrame, i) = data[i,:]
5050
MLUtils.numobs(data::DataFrame) = nrow(data)
51-
df_balanced = getobs(undersample(df, df.target;shuffle=true))
51+
df_balanced = getobs(undersample(df, df.target;shuffle=true))[1]
5252
```
5353

5454
```{julia}
55-
CSV.write(joinpath(data_path, "cal_housing.csv"), df)
55+
CSV.write(joinpath(data_path, "cal_housing.csv"), df_balanced)
5656
```
5757

5858

5959
## Give Me Some Credit
6060

6161
```{julia}
62-
using CSV, DataFrames, Statistics, StatsBase
6362
df = CSV.read(joinpath(data_path, "raw/cs-training.csv"), DataFrame)
6463
select!(df, Not([:Column1]))
6564
rename!(df, :SeriousDlqin2yrs => :target)
@@ -68,7 +67,7 @@ dropmissing!(df)
6867
mapcols!(x -> eltype(x) <: AbstractString ? parse.(Int, x) : x, df)
6968
# Features:
7069
X = Matrix(df[:,Not(:target)])
71-
dt = fit(ZScoreTransform, X, dims=1)
70+
dt = StatsBase.fit(ZScoreTransform, X, dims=1)
7271
StatsBase.transform!(dt, X)
7372
# Target:
7473
y = df.target
@@ -83,7 +82,7 @@ using MLUtils: undersample
8382
# Make DataFrames.jl work
8483
MLUtils.getobs(data::DataFrame, i) = data[i,:]
8584
MLUtils.numobs(data::DataFrame) = nrow(data)
86-
df_balanced = getobs(undersample(df, df.target;shuffle=true))
85+
df_balanced = getobs(undersample(df, df.target;shuffle=true))[1]
8786
```
8887

8988
```{julia}
@@ -93,30 +92,21 @@ CSV.write(joinpath(data_path, "gmsc.csv"), df_balanced)
9392
## UCI Credit Card Default
9493

9594
```{julia}
96-
using CSV, DataFrames, Statistics, StatsBase
9795
df = CSV.read(joinpath(data_path, "raw/UCI_Credit_Card.csv"), DataFrame)
98-
select!(df, Not([:ID, :SEX, :EDUCATION, :MARRIAGE]))
96+
select!(df, Not([:ID]))
9997
rename!(df, "default.payment.next.month" => :target)
10098
dropmissing!(df)
99+
df.SEX = categorical(df.SEX)
100+
df.EDUCATION = categorical(df.EDUCATION)
101+
df.MARRIAGE = categorical(df.MARRIAGE)
101102
mapcols!(x -> eltype(x) <: AbstractString ? parse.(Int, x) : x, df)
102-
# Features:
103-
X = Matrix(df[:,Not(:target)])
104-
dt = fit(ZScoreTransform, X, dims=1)
105-
StatsBase.transform!(dt, X)
106-
# Target:
107-
y = df.target
108-
# Data:
109-
df = DataFrame(X,:auto)
110-
df.target = y
111103
```
112104

113105
```{julia}
114-
using MLUtils
115-
using MLUtils: undersample
116106
# Make DataFrames.jl work
117107
MLUtils.getobs(data::DataFrame, i) = data[i,:]
118108
MLUtils.numobs(data::DataFrame) = nrow(data)
119-
df_balanced = getobs(undersample(df, df.target;shuffle=true))
109+
df_balanced = getobs(undersample(df, df.target;shuffle=true))[1]
120110
```
121111

122112
```{julia}

0 commit comments

Comments
 (0)