Winsorizing

The winsorize function tries to emulate Stata's winsor command.

There is a winsor function in StatsBase.jl but I think it's a little less full-featured.

Basic usage

Start with a simple distribution to visualize the effect of winsorizing

Random.seed!(3); x = randn(10_000);
p1 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution",
    framestyle=:box, size=(1250,750))

Replace the outliers based on quantile

x_win = winsorize(x, probs=(0.05, 0.95));
p2 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box);
histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized")

One-sided trim

x_win = winsorize(x, probs=(0, 0.8));
p3 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box);
histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");

Bring your own cutpoints

Another type of winsorizing is to specify your own cutpoints (they do not have to be symmetric):

x_win = winsorize(x, cutpoints=(-1.96, 2.575));
p4 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box);
histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");

Rely on the computer to select the right cutpoints

If you specify neither probs nor cutpoints, the cutpoints are inferred automatically:

x_win = winsorize(x; verbose=true);
p5 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box);
histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");
[ Info: Inferred cutpoints are ... (-4.073837032137298, 4.019734075131403) (using interquartile range x 3 from median)

How not to replace outliers

If you do not want outliers to be replaced by the cutpoints, specify replace_value=missing:

x_win = winsorize(x, cutpoints=(-2.575, 1.96), replace_value=missing);
p6 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box);
histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");

How to choose your replacement

The replace_value keyword argument gives you the flexibility to choose which values replace the outliers, independently of the cutpoints:

x_win = winsorize(x, cutpoints=(-2.575, 1.96), replace_value=(-1.96, 1.28));
p7 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box);
histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");

Within a DataFrame

I try to mimic the gtools winsor example.

Winsorize one variable

df = DataFrame(PalmerPenguins.load())

# gstats winsor wage
transform!(df, :body_mass_g => (x -> winsorize(x, probs=(0.1, 0.9)) ) => :body_mass_g_w)

p8 = histogram(df.body_mass_g, bins=2700:100:6300, color="blue", label="distribution", framestyle=:box);
histogram!(df.body_mass_g_w, bins=2700:100:6300, color="red", opacity=0.5, label="winsorized");

Winsorize multiple variables

# gstats winsor wage age hours, cuts(0.5 99.5) replace
var_to_winsorize = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm"]
transform!(df,
    var_to_winsorize .=> (x -> winsorize(x, probs=(0.1, 0.9)) ) .=> var_to_winsorize .* "_w")
show(IOContext(stdout, :limit => true, :displaysize => (20, 100)),
    select(df, :species, :island, :bill_length_mm, :bill_length_mm_w,
               :bill_depth_mm, :bill_depth_mm_w, :flipper_length_mm, :flipper_length_mm_w),
    allcols=true, allrows=false)
344×8 DataFrame
 Row │ species    island     bill_length_mm  bill_length_mm_w  bill_depth_mm  bill_depth_mm_w  flipper_length_mm  flipper_length_mm_w
     │ String15   String15   Float64?        Float64?          Float64?       Float64?         Int64?             Union…?
─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1 │ Adelie     Torgersen            39.1              39.1           18.7             18.7                181                185.0
   2 │ Adelie     Torgersen            39.5              39.5           17.4             17.4                186                186
   3 │ Adelie     Torgersen            40.3              40.3           18.0             18.0                195                195
   4 │ Adelie     Torgersen       missing           missing        missing          missing              missing            missing
   5 │ Adelie     Torgersen            36.7              36.7           19.3             19.3                193                193
   6 │ Adelie     Torgersen            39.3              39.3           20.6             19.5                190                190
  ⋮  │     ⋮          ⋮            ⋮                ⋮                ⋮               ⋮                 ⋮                   ⋮
 340 │ Chinstrap  Dream                55.8              50.8           19.8             19.5                207                207
 341 │ Chinstrap  Dream                43.5              43.5           18.1             18.1                202                202
 342 │ Chinstrap  Dream                49.6              49.6           18.2             18.2                193                193
 343 │ Chinstrap  Dream                50.8              50.8           19.0             19.0                210                210
 344 │ Chinstrap  Dream                50.2              50.2           18.7             18.7                198                198
                                                                                                                      333 rows omitted

Winsorize on one side only

# left-winsorizing only (the Stata example cuts at the 1st percentile; the Julia call below cuts at the 10th);
# cap noi gstats winsor wage, cuts(1 100); gstats winsor wage, cuts(1 100) s(_w2)
transform!(df, :body_mass_g => (x -> winsorize(x, probs=(0.1, 1)) ) => :body_mass_g_w )
show(IOContext(stdout, :limit => true, :displaysize => (20, 100)),
    select(df, :species, :island, :body_mass_g, :body_mass_g_w),
    allcols=true, allrows=false)
344×4 DataFrame
 Row │ species    island     body_mass_g  body_mass_g_w
     │ String15   String15   Int64?       Union…?
─────┼──────────────────────────────────────────────────
   1 │ Adelie     Torgersen         3750         3750
   2 │ Adelie     Torgersen         3800         3800
   3 │ Adelie     Torgersen         3250         3300.0
   4 │ Adelie     Torgersen      missing      missing
   5 │ Adelie     Torgersen         3450         3450
   6 │ Adelie     Torgersen         3650         3650
  ⋮  │     ⋮          ⋮           ⋮             ⋮
 340 │ Chinstrap  Dream             4000         4000
 341 │ Chinstrap  Dream             3400         3400
 342 │ Chinstrap  Dream             3775         3775
 343 │ Chinstrap  Dream             4100         4100
 344 │ Chinstrap  Dream             3775         3775
                                        333 rows omitted

Winsorize by groups

transform!(
    groupby(df, :sex),
    :body_mass_g => (x -> winsorize(x, probs=(0.2, 0.8)) ) => :body_mass_g_w)
p9 = histogram(df[ isequal.(df.sex, "male"), :body_mass_g], bins=3000:100:6300,
    color="blue", label="distribution", framestyle=:box);
histogram!(df[ isequal.(df.sex, "male"), :body_mass_g_w], bins=3000:100:6300,
    color="red", opacity=0.5, label="winsorized");