Highly flexible and efficient computation of n-dimensional binned statistic(s) for n-variable(s)
BinStatistics provides the binstats function that is build on top of DataFrames.jl
and CatagoricalArrays.jl
binstats is 2X-10X faster than Python's scipy-1.8.0
Expect breaking changes as this package is under active development
"""
    binstats(df, axis_col, axis_edges, bin_col; 
        grp_function = [nrow], col_function = [mean], missing_bin = false)
    
Returns a DataFrame containing function values for binned variables of `df`.
# Arguments
- `axis_col`: binning axes column(s)
- `axis_edges`: bin edges for `axis_col`
- `bin_col`: column variable(s) to be binned
- `grp_function = [nrow]`: column independent funciton(s) to be applied at group level
- `var_function = [mean]`: column dependent funciton(s) to be applied to `bin_col` at group level
- `missing_bins = false`: include missing bins
"""using Pkg
Pkg.add("BinStatistics")
Pkg.add("DataFrames")
Pkg.add("Statistics")
Pkg.add("CairoMakie")
using BinStatistics
using DataFrames
using Statistics
using CairoMakiebegin
    n = 1000000;
    df = DataFrame();
    df.x = rand(n).*20;
    df.y = rand(n).*20;
    df.v1 = cos.(df.x) .+ randn(n)*3;
    df.v2 = cos.(df.x .- df.y) .+ sin.(df.x .+ df.y) .+ randn(n)*3;
    df.v3 = df.v1 .+ df.v2;
enddf1 = binstats(df, :x, 0:0.1:20, :v1)
200×3 DataFrame
 Row │ x        nrow   v1_mean  
     │ Float64  Int64  Float64  
─────┼──────────────────────────
   1 │    0.05   4932  0.957416
   2 │    0.15   4922  0.966772
  ⋮  │    ⋮       ⋮       ⋮
 199 │   19.85   5085  0.56495
 200 │   19.95   4958  0.491761
 NOTE: `x` labels are bin centersdf2 = binstats(df, :x, 0:0.1:20, ["v1", "v2"])
200×4 DataFrame
 Row │ x        nrow   v1_mean   v2_mean   
     │ Float64  Int64  Float64   Float64   
─────┼─────────────────────────────────────
   1 │    0.05   4932  0.957416  0.0521698
   2 │    0.15   4922  0.966772  0.134747
  ⋮  │    ⋮       ⋮       ⋮          ⋮
 199 │   19.85   5085  0.56495   0.0731969
 200 │   19.95   4958  0.491761  0.113065df3 = binstats(df, :x, 0:0.1:20, :v1; col_function = [mean, median, std])
200×5 DataFrame
 Row │ x        nrow   v1_mean   v1_median  v1_std  
     │ Float64  Int64  Float64   Float64    Float64 
─────┼──────────────────────────────────────────────
   1 │    0.05   4932  0.957416   1.01216   2.94134
   2 │    0.15   4922  0.966772   0.990715  2.95307
  ⋮  │    ⋮       ⋮       ⋮          ⋮         ⋮
 199 │   19.85   5085  0.56495    0.617968  3.00214
 200 │   19.95   4958  0.491761   0.487893  2.9561df4 = binstats(df, [:y, :x], [0:.2:20, 0:.2:20], [:v2]; missing_bins = true)
10000×4 DataFrame
   Row │ y        x        nrow   v2_mean 
       │ Float64  Float64  Int64  Float64 
───────┼──────────────────────────────────
     1 │     0.1      0.1    102  1.0629
     2 │     0.1      0.3     87  1.46221
   ⋮   │    ⋮        ⋮       ⋮       ⋮
  9999 │    19.9     19.7     96  1.80224
 10000 │    19.9     19.9     94  2.40527df5 = binstats(df, [:y, :x], [(0:0.5:4.5).^2, (0:0.5:4.5).^2], [:v2], grp_function = [], col_function = [median], missing_bins = true)
81×3 DataFrame
 Row │ y        x        v2_median   
     │ Float64  Float64  Float64     
─────┼───────────────────────────────
   1 │   0.125    0.125   0.94437
   2 │   0.125    0.625   1.79481
  ⋮  │    ⋮        ⋮          ⋮
  80 │  18.125   14.125  -0.00643648
  81 │  18.125   18.125   0.00196411# create a median absolute deviation function
function mad(x)
    median(abs.(x .- median(x))) 
end
# binstats also accepts anonymous functions but the output will be assinged a generic name
# apply to grouped data
df6 = binstats(df, [:y, :x], [0:1:20, 0:1:20], [:v2], grp_function = [], col_function = [mad],; missing_bins = true)
400×3 DataFrame
 Row │ y        x        v2_mad  
     │ Float64  Float64  Float64 
─────┼───────────────────────────
   1 │     0.5      0.5  2.04322
   2 │     0.5      1.5  2.08714
  ⋮  │    ⋮        ⋮        ⋮
 399 │    19.5     18.5  2.17078
 400 │    19.5     19.5  2.02198# Example 1
begin 
    fig = Figure()
    Axis(fig[1, 1], title = "raw data")
    scatter!(fig[1, 1], df.x, df.v1)
    Axis(fig[1, 2], title = "binned data")
    scatter!(fig[1, 2], df1[:,1], df1.v1_mean)  
    fig
end
# Example 2
begin
    fig = Figure()
    Axis(fig[1, 1], title = "raw data")
    scatter!(fig[1, 1], df.x, df.v1)
    scatter!(fig[1, 1], df.x, df.v2)
    Axis(fig[1, 2], title = "binned data")
    scatter!(fig[1, 2], df2[:,1], df2.v1_mean, label = "v1")
    scatter!(fig[1, 2], df2[:,1], df2.v2_mean, label = "v2")
    axislegend()
    fig
end
# Example 3
begin
    fig = Figure()
    Axis(fig[1, 1], title = "raw data")
    scatter!(fig[1, 1], df.x, df.v1)
    Axis(fig[1, 2], title = "binned data")
    scatter!(fig[1, 2], df3[:,1], df3.v1_mean, label = "mean")
    scatter!(fig[1, 2], df3[:,1], df3.v1_median, label = "median")
    scatter!(fig[1, 2], df3[:,1], df3.v1_std, label = "std")
    axislegend()
    fig
end
# Example 4
begin 
    fig = Figure()
    Axis(fig[1, 1], title = "raw data")
    scatter!(fig[1, 1], df.y, df.x, color = df.v2, colormap = :thermal, markersize = 1)
    xlims!(0, 20); ylims!(0, 20)
    Axis(fig[1, 2], title = "binned data")
    heatmap!(fig[1, 2], unique(df4[:,1]),unique(df4[:,2]), 
        reshape(df4.v2_mean,length(unique(df4[:,2])),length(unique(df4[:,1]))), 
        colormap = :thermal)
    fig
end
# Example 5
begin 
    fig = Figure()
    Axis(fig[1, 1], title = "raw data")
    scatter!(fig[1, 1], df.y, df.x, color = df.v2, colormap = :thermal, markersize = 1)
    xlims!(0, 20); ylims!(0, 20)
    Axis(fig[1, 2], title = "binned data")
    heatmap!(fig[1, 2], unique(df5[:,1]),unique(df5[:,2]),
        reshape(df5.v2_median,length(unique(df5[:,2])),length(unique(df5[:,1]))), colormap = :thermal)
    fig
end
# Example 6
begin 
    fig = Figure()
    Axis(fig[1, 1], title = "raw data")
    scatter!(fig[1, 1], df.y, df.x, color = df.v2, colormap = :thermal, markersize = 1)
    xlims!(0, 20); ylims!(0, 20)
    Axis(fig[1, 2], title = "binned data")
    heatmap!(fig[1, 2], unique(df6[:,1]),unique(df6[:,2]), 
        reshape(df6.v2_mad,length(unique(df6[:,2])),length(unique(df6[:,1]))), 
        colormap = :thermal)
    fig
end
BinnedStatistics.jl for single variable 1-D binned statistics
Scipy's binned_statistic, binned_statistic_2d, and binned_statistic_dd for single variable 1-, 2-, and n-dimensional binned statistics





