ModelSanitizer exports the sanitize! function and the Model, Data, and
ForceSanitize structs.
If your model is stored in m and your data are stored in x1, x2, x3, etc.,
then you can sanitize your model with:

sanitize!(Model(m), Data(x1), Data(x2), Data(x3), ...)

This will recursively search inside the model m for anything that resembles
your data and delete any data that it finds.
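For example, a common reason to sanitize a model is so that it can be saved or
shared without carrying its training data along. The following is a minimal
sketch of that workflow; MyModel, its fields, and the output filename are
hypothetical and only for illustration:

using ModelSanitizer
using Serialization

# Hypothetical model type (not part of ModelSanitizer) that keeps a copy of
# its training data alongside the fitted coefficients:
mutable struct MyModel
    X::Matrix{Float64}             # copy of the training features
    y::Vector{Float64}             # copy of the training labels
    coefficients::Vector{Float64}  # fitted least-squares coefficients
end

X = randn(100, 3)
y = randn(100)
model = MyModel(copy(X), copy(y), X \ y)  # fit a toy least-squares model

# Remove anything inside the model that resembles X or y:
sanitize!(Model(model), Data(X), Data(y))

# The saved model keeps its coefficients, but the copies of X and y inside it
# have been zeroed, as the worked example below demonstrates in detail.
serialize("mymodel.jls", model)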
If you happen to know exactly where inside a model the data are stored, you
can explicitly tell ModelSanitizer to delete those data. If your model is
stored in m, and you know that the fields m.x1, m.x2, m.x3, etc. contain
data that need to be removed, you can force ModelSanitizer to delete those
data with:

sanitize!(ForceSanitize(m.x1), ForceSanitize(m.x2), ForceSanitize(m.x3), ...)

The following worked example trains a simple linear model, sanitizes it, and
verifies that it can still make predictions afterwards:

julia> using ModelSanitizer
julia> using Statistics
julia> using Test
julia> mutable struct LinearModel{T}
           X::Matrix{T}
           y::Vector{T}
           beta::Vector{T}
           function LinearModel{T}()::LinearModel{T} where T
               m::LinearModel{T} = new()
               return m
           end
       end
julia> function fit!(m::LinearModel{T}, X::Matrix{T}, y::Vector{T})::LinearModel{T} where T
           m.X = deepcopy(X)
           m.y = deepcopy(y)
           m.beta = (m.X'm.X) \ (m.X'm.y)
           return m
       end
fit! (generic function with 1 method)
julia> function predict(m::LinearModel{T}, X::Matrix{T})::Vector{T} where T
           y_hat::Vector{T} = X * m.beta
           return y_hat
       end
predict (generic function with 1 method)
julia> function predict(m::LinearModel{T})::Vector{T} where T
           X::Matrix{T} = m.X
           y_hat::Vector{T} = predict(m, X)
           return y_hat
       end
predict (generic function with 2 methods)
julia> function mse(y::Vector{T}, y_hat::Vector{T})::T where T
           _mse::T = mean((y .- y_hat).^2)
           return _mse
       end
mse (generic function with 1 method)
julia> function mse(m::LinearModel{T}, X::Matrix{T}, y::Vector{T})::T where T
           y_hat::Vector{T} = predict(m, X)
           _mse::T = mse(y, y_hat)
           return _mse
       end
mse (generic function with 2 methods)
julia> function mse(m::LinearModel{T})::T where T
           X::Matrix{T} = m.X
           y::Vector{T} = m.y
           _mse::T = mse(m, X, y)
           return _mse
       end
mse (generic function with 3 methods)
julia> rmse(varargs...) = sqrt(mse(varargs...))
rmse (generic function with 1 method)
julia> function r2(y::Vector{T}, y_hat::Vector{T})::T where T
           y_bar::T = mean(y)
           SS_tot::T = sum((y .- y_bar).^2)
           SS_res::T = sum((y .- y_hat).^2)
           _r2::T = 1 - SS_res/SS_tot
           return _r2
       end
r2 (generic function with 1 method)
julia> function r2(m::LinearModel{T}, X::Matrix{T}, y::Vector{T})::T where T
           y_hat::Vector{T} = predict(m, X)
           _r2::T = r2(y, y_hat)
           return _r2
       end
r2 (generic function with 2 methods)
julia> function r2(m::LinearModel{T})::T where T
           X::Matrix{T} = m.X
           y::Vector{T} = m.y
           _r2::T = r2(m, X, y)
           return _r2
       end
r2 (generic function with 3 methods)
julia> X = randn(Float64, 5_000, 14)
5000×14 Array{Float64,2}:
0.0956436 0.481324 -0.796437 … -2.26483 1.57243 -1.65105
-0.306527 -0.880146 -0.764714 -0.182449 -0.0767462 -0.939232
-0.223116 -0.408068 0.728855 0.220045 0.785533 0.49013
-0.336363 1.46187 -1.17633 -0.955872 0.699277 0.587961
0.628275 0.208697 -0.522714 0.116233 0.47314 0.435968
-0.12303 -0.964061 0.919518 … -0.0230613 -1.12379 -0.439892
1.06664 0.96542 -0.250164 -0.776266 1.70851 -1.08608
0.957151 0.850486 1.31718 0.497219 1.01069 -0.558217
-0.206168 -0.608305 -0.864631 0.969031 0.209796 1.28718
-0.658039 1.20687 1.33288 1.54847 0.546286 -1.00404
-0.598782 -0.193289 0.673134 … -1.59742 0.410881 -1.61342
0.31442 0.0199012 0.50533 1.0889 -0.0713841 -1.29933
0.236585 -1.09804 0.945631 -0.729247 -1.10004 -0.339332
0.122913 0.619345 -2.90947 1.09613 -0.662693 -1.03469
1.52615 0.942471 0.262139 0.223064 0.665103 1.4081
-0.474543 1.9466 -0.408505 … 1.01626 -0.297397 -0.0953909
0.73664 -0.0796424 -1.84864 1.15935 0.0164378 1.32191
0.24588 0.271068 -0.238212 0.596475 1.52617 -0.747777
⋮ ⋱
-1.07141 0.194049 -0.350011 -0.666195 0.481406 -0.451329
-0.00993413 0.33006 -0.985443 -0.0395822 2.36983 -0.793007
0.610014 -0.509744 -1.06447 … 1.19769 1.129 0.397217
0.785654 -0.361031 0.314127 0.192215 0.789262 0.725731
0.258588 -2.06379 0.511611 0.0963516 -1.01919 -0.540021
0.48671 -0.918205 0.264124 0.989929 2.45245 -1.39545
-1.27085 -0.0617834 2.59491 0.291602 1.28642 0.236496
1.4044 -1.24472 -0.205029 … 1.99366 -1.58951 0.963728
-1.07691 0.44178 -0.602841 0.584759 -0.887116 1.36514
1.13586 0.954756 0.44016 -2.21191 -1.14086 -0.585916
-0.763031 -1.13348 -1.46696 -1.4121 -0.977694 -0.618883
0.875367 -1.30925 0.183117 0.224709 0.0752964 -0.92173
0.659502 0.71971 -1.05538 … -0.912277 -0.736332 1.01404
-0.809941 2.02362 1.29668 0.113623 -0.858281 0.0863472
-1.6409 0.310551 -0.235102 -1.11232 -0.170224 0.404804
-0.367908 -1.9062 0.245953 -0.751821 -0.794633 0.00894607
0.380897 2.30871 -0.669909 0.282513 -0.114725 -0.253537
julia> y = X * randn(Float64, 14) + randn(5_000)
5000-element Array{Float64,1}:
-4.418867382994752
1.0721553534178543
2.210545604666476
-2.5053994409702094
2.24399399066432
0.5993702994926247
2.2040361967638322
-2.4902628750358193
4.184644001244288
1.7688752332135804
-4.831550352023476
-1.068149084362266
-0.746260929030723
0.032933800577055417
2.878202216460962
2.773804353610833
1.0288912118472482
3.7799578982964963
⋮
3.1797791441997822
5.830717537973503
-0.8191545280972992
4.649281267724443
0.9470989605451162
5.733118456044454
3.057352206232011
4.791267454465988
-4.604222639675081
-5.755448165821573
-0.9804279159155482
2.2904285226467276
2.809999802793834
0.7773010780323945
-2.5205742651574
3.8866539005621092
-4.085889556008112
julia> m = LinearModel{Float64}()
LinearModel{Float64}(#undef, #undef, #undef)
julia> testing_rows = 1:2:5_000
1:2:4999
julia> training_rows = setdiff(1:5_000, testing_rows)
2500-element Array{Int64,1}:
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
⋮
4968
4970
4972
4974
4976
4978
4980
4982
4984
4986
4988
4990
4992
4994
4996
4998
5000
julia> fit!(m, X[training_rows, :], y[training_rows])
LinearModel{Float64}([-0.306527 -0.880146 … -0.0767462 -0.939232; -0.336363 1.46187 … 0.699277 0.587961; … ; -1.6409 0.310551 … -0.170224 0.404804; 0.380897 2.30871 … -0.114725 -0.253537], [1.07216, -2.5054, 0.59937, -2.49026, 1.76888, -1.06815, 0.0329338, 2.7738, 3.77996, -4.06727 … 2.81088, 3.17978, -0.819155, 0.947099, 3.05735, -4.60422, -0.980428, 2.81, -2.52057, -4.08589], [-0.532213, -1.16489, -0.414974, -0.562536, -0.440432, 0.732505, -1.06754, 0.399485, -0.67281, -1.44599, 0.835625, 0.426459, 1.20088, 0.754435])
julia> @test m.X == X[training_rows, :]
Test Passed
julia> @test m.y == y[training_rows]
Test Passed
julia> @test all(m.X .== X[training_rows, :])
Test Passed
julia> @test all(m.y .== y[training_rows])
Test Passed
julia> @test !all(m.X .== 0)
Test Passed
julia> @test !all(m.y .== 0)
Test Passed
julia> # before sanitization, we can make predictions
       predict(m, X[testing_rows, :])
2500-element Array{Float64,1}:
-4.513253714187381
2.5689035333536605
0.9939782906365846
1.2513894159362184
3.2007086601687353
-5.387968774216589
-0.1767892797746935
3.4408813711668165
0.4625821018811823
1.649129884116436
-0.8620887900500149
0.6504970487658756
4.287913533796443
-2.5014166099065136
1.1666979326633855
0.2723098985354143
3.2783930370766634
2.250636815003683
⋮
1.1999638265752477
3.8377489399901084
4.2805489451765935
-0.5849048693472063
-0.6574890049656816
0.2606368302418087
-4.197310605534758
-3.5805273324146336
-0.5244747588662737
5.274904154193373
2.7742388165636953
5.883741172337488
2.118699747786167
-4.209943069147431
2.262361580682631
-0.5044151513387216
4.443422779093501
julia> predict(m, X[training_rows, :])
2500-element Array{Float64,1}:
2.943212508610099
-0.8226863248850258
1.031068845178503
-3.3178919274576053
0.587046578244962
-0.032251634503744686
1.9123819046207888
3.555603804394087
2.1728937544760307
-1.9319447549669504
-0.7592148524301295
-7.250437603426189
4.982277986708986
-1.8660967909674548
0.29423182806971415
0.593840341165224
-0.26314562641917977
1.4340414682799685
⋮
1.6038174714835796
1.3091787016871341
4.936123830680592
1.9812183495287048
-0.848632475032059
3.1553721781769157
-5.412240178264108
1.406559298117795
3.6433312336276646
0.3408165307792135
0.2882242203753349
1.8120206189755343
-3.299798877655878
-0.8793971451160698
2.3158119962568886
-2.4598360012327265
-4.810128269819875
julia> @show mse(m, X[training_rows, :], y[training_rows])
mse(m, X[training_rows, :], y[training_rows]) = 0.9856973993855034
0.9856973993855034
julia> @show rmse(m, X[training_rows, :], y[training_rows])
rmse(m, X[training_rows, :], y[training_rows]) = 0.9928229446308658
0.9928229446308658
julia> @show r2(m, X[training_rows, :], y[training_rows])
r2(m, X[training_rows, :], y[training_rows]) = 0.9044357103305194
0.9044357103305194
julia> @show mse(m, X[testing_rows, :], y[testing_rows])
mse(m, X[testing_rows, :], y[testing_rows]) = 0.9480778102674918
0.9480778102674918
julia> @show rmse(m, X[testing_rows, :], y[testing_rows])
rmse(m, X[testing_rows, :], y[testing_rows]) = 0.9736928726592856
0.9736928726592856
julia> @show r2(m, X[testing_rows, :], y[testing_rows])
r2(m, X[testing_rows, :], y[testing_rows]) = 0.9088387716983182
0.9088387716983182
julia> sanitize!(Model(m), Data(X), Data(y)) # sanitize the model with ModelSanitizer
Model{LinearModel{Float64}}(LinearModel{Float64}([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 … 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [-0.532213, -1.16489, -0.414974, -0.562536, -0.440432, 0.732505, -1.06754, 0.399485, -0.67281, -1.44599, 0.835625, 0.426459, 1.20088, 0.754435]))
julia> @test m.X != X[training_rows, :]
Test Passed
julia> @test m.y != y[training_rows]
Test Passed
julia> @test !all(m.X .== X[training_rows, :])
Test Passed
julia> @test !all(m.y .== y[training_rows])
Test Passed
julia> @test all(m.X .== 0)
Test Passed
julia> @test all(m.y .== 0)
Test Passed
julia> # after sanitization, we are still able to make predictions
       predict(m, X[testing_rows, :])
2500-element Array{Float64,1}:
-4.513253714187381
2.5689035333536605
0.9939782906365846
1.2513894159362184
3.2007086601687353
-5.387968774216589
-0.1767892797746935
3.4408813711668165
0.4625821018811823
1.649129884116436
-0.8620887900500149
0.6504970487658756
4.287913533796443
-2.5014166099065136
1.1666979326633855
0.2723098985354143
3.2783930370766634
2.250636815003683
⋮
1.1999638265752477
3.8377489399901084
4.2805489451765935
-0.5849048693472063
-0.6574890049656816
0.2606368302418087
-4.197310605534758
-3.5805273324146336
-0.5244747588662737
5.274904154193373
2.7742388165636953
5.883741172337488
2.118699747786167
-4.209943069147431
2.262361580682631
-0.5044151513387216
4.443422779093501
julia> predict(m, X[training_rows, :])
2500-element Array{Float64,1}:
2.943212508610099
-0.8226863248850258
1.031068845178503
-3.3178919274576053
0.587046578244962
-0.032251634503744686
1.9123819046207888
3.555603804394087
2.1728937544760307
-1.9319447549669504
-0.7592148524301295
-7.250437603426189
4.982277986708986
-1.8660967909674548
0.29423182806971415
0.593840341165224
-0.26314562641917977
1.4340414682799685
⋮
1.6038174714835796
1.3091787016871341
4.936123830680592
1.9812183495287048
-0.848632475032059
3.1553721781769157
-5.412240178264108
1.406559298117795
3.6433312336276646
0.3408165307792135
0.2882242203753349
1.8120206189755343
-3.299798877655878
-0.8793971451160698
2.3158119962568886
-2.4598360012327265
-4.810128269819875
julia> @show mse(m, X[training_rows, :], y[training_rows])
mse(m, X[training_rows, :], y[training_rows]) = 0.9856973993855034
0.9856973993855034
julia> @show rmse(m, X[training_rows, :], y[training_rows])
rmse(m, X[training_rows, :], y[training_rows]) = 0.9928229446308658
0.9928229446308658
julia> @show r2(m, X[training_rows, :], y[training_rows])
r2(m, X[training_rows, :], y[training_rows]) = 0.9044357103305194
0.9044357103305194
julia> @show mse(m, X[testing_rows, :], y[testing_rows])
mse(m, X[testing_rows, :], y[testing_rows]) = 0.9480778102674918
0.9480778102674918
julia> @show rmse(m, X[testing_rows, :], y[testing_rows])
rmse(m, X[testing_rows, :], y[testing_rows]) = 0.9736928726592856
0.9736928726592856
julia> @show r2(m, X[testing_rows, :], y[testing_rows])
r2(m, X[testing_rows, :], y[testing_rows]) = 0.9088387716983182
0.9088387716983182
julia> # if you know exactly where the data are stored inside the model, you can
       # directly delete them with ForceSanitize:
       sanitize!(ForceSanitize(m.X), ForceSanitize(m.y))
(ForceSanitize{Array{Float64,2}}([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]), ForceSanitize{Array{Float64,1}}([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 … 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
julia> # we can still make predictions even after using ForceSanitize
       predict(m, X[testing_rows, :])
2500-element Array{Float64,1}:
-4.513253714187381
2.5689035333536605
0.9939782906365846
1.2513894159362184
3.2007086601687353
-5.387968774216589
-0.1767892797746935
3.4408813711668165
0.4625821018811823
1.649129884116436
-0.8620887900500149
0.6504970487658756
4.287913533796443
-2.5014166099065136
1.1666979326633855
0.2723098985354143
3.2783930370766634
2.250636815003683
⋮
1.1999638265752477
3.8377489399901084
4.2805489451765935
-0.5849048693472063
-0.6574890049656816
0.2606368302418087
-4.197310605534758
-3.5805273324146336
-0.5244747588662737
5.274904154193373
2.7742388165636953
5.883741172337488
2.118699747786167
-4.209943069147431
2.262361580682631
-0.5044151513387216
4.443422779093501
julia> predict(m, X[training_rows, :])
2500-element Array{Float64,1}:
2.943212508610099
-0.8226863248850258
1.031068845178503
-3.3178919274576053
0.587046578244962
-0.032251634503744686
1.9123819046207888
3.555603804394087
2.1728937544760307
-1.9319447549669504
-0.7592148524301295
-7.250437603426189
4.982277986708986
-1.8660967909674548
0.29423182806971415
0.593840341165224
-0.26314562641917977
1.4340414682799685
⋮
1.6038174714835796
1.3091787016871341
4.936123830680592
1.9812183495287048
-0.848632475032059
3.1553721781769157
-5.412240178264108
1.406559298117795
3.6433312336276646
0.3408165307792135
0.2882242203753349
1.8120206189755343
-3.299798877655878
-0.8793971451160698
2.3158119962568886
-2.4598360012327265
-4.810128269819875
julia> @show mse(m, X[training_rows, :], y[training_rows])
mse(m, X[training_rows, :], y[training_rows]) = 0.9856973993855034
0.9856973993855034
julia> @show rmse(m, X[training_rows, :], y[training_rows])
rmse(m, X[training_rows, :], y[training_rows]) = 0.9928229446308658
0.9928229446308658
julia> @show r2(m, X[training_rows, :], y[training_rows])
r2(m, X[training_rows, :], y[training_rows]) = 0.9044357103305194
0.9044357103305194
julia> @show mse(m, X[testing_rows, :], y[testing_rows])
mse(m, X[testing_rows, :], y[testing_rows]) = 0.9480778102674918
0.9480778102674918
julia> @show rmse(m, X[testing_rows, :], y[testing_rows])
rmse(m, X[testing_rows, :], y[testing_rows]) = 0.9736928726592856
0.9736928726592856
julia> @show r2(m, X[testing_rows, :], y[testing_rows])
r2(m, X[testing_rows, :], y[testing_rows]) = 0.9088387716983182
0.9088387716983182
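The same ForceSanitize pattern applies when the data are buried deeper inside
the model, for example inside a nested struct: pass each nested field that
holds data. A minimal sketch, assuming hypothetical Dataset and WrappedModel
types that are not part of ModelSanitizer:

using ModelSanitizer

# Hypothetical types for illustration only:
mutable struct Dataset
    features::Matrix{Float64}
    labels::Vector{Float64}
end

mutable struct WrappedModel
    dataset::Dataset               # nested object that holds the raw data
    coefficients::Vector{Float64}  # fitted parameters we want to keep
end

X = randn(50, 2)
y = randn(50)
wrapped = WrappedModel(Dataset(copy(X), copy(y)), X \ y)

# Point ForceSanitize at each nested field that holds the data:
sanitize!(ForceSanitize(wrapped.dataset.features),
          ForceSanitize(wrapped.dataset.labels))

all(iszero, wrapped.dataset.features)  # true: the stored features have been zeroed
all(iszero, wrapped.dataset.labels)    # true: the stored labels have been zeroed
all(iszero, wrapped.coefficients)      # false: the fitted coefficients are untouched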