The cheapest threads you can find!
Popularity
169 Stars
Updated Last
12 Months Ago
Started In
February 2021

# Polyester

Note that `Polyester.@batch` moves arrays to threads by turning them into StrideArraysCore.PtrArrays. This means that under an `@batch` slices will create `view`s by default, and that you may also need to start Julia with `--check-bounds=yes` while debugging.

Polyester.jl provides low overhead threading. The primary API is `@batch`, which can be used in place of `Threads.@threads`. The number of available threads is still governed by `--threads` or `JULIA_NUM_THREADS`, as reported by `Threads.nthreads()`. Lets look at a simple benchmark.

```using Polyester, LinearAlgebra, BenchmarkHistograms
function axpy_serial!(y, a, x)
@inbounds for i in eachindex(y,x)
y[i] = muladd(a, x[i], y[i])
end
end
# One thread per core, the default (the threads are not pinned)
function axpy_per_core!(y, a, x)
@batch per=core for i in eachindex(y,x)
y[i] = muladd(a, x[i], y[i])
end
end
function axpy_per_thread!(y, a, x)
@batch per=thread for i in eachindex(y,x)
y[i] = muladd(a, x[i], y[i])
end
end
# Set a minimum batch size of `200`
function axpy_minbatch!(y, a, x)
@batch minbatch=2000 for i in eachindex(y,x)
y[i] = muladd(a, x[i], y[i])
end
end
function axpy_atthread!(y, a, x)
@inbounds y[i] = muladd(a, x[i], y[i])
end
end

y = rand(10_000);
x = rand(10_000);
@benchmark axpy_serial!(\$y, eps(), \$x)
@benchmark axpy!(eps(), \$x, \$y)
@benchmark axpy_atthread!(\$y, eps(), \$x)
@benchmark axpy_per_core!(\$y, eps(), \$x)
@benchmark axpy_per_thread!(\$y, eps(), \$x)
@benchmark axpy_minbatch!(\$y, eps(), \$x)
versioninfo()```

With only `10_000` elements, this simply `muladd` loop can't afford the overhead of threads like `BLAS` or `Threads.@threads`, they just slow the computations down. But these 10_000 elements can afford `Polyester`, giving up to a >2x speedup on 4 cores.

```julia> @benchmark axpy_serial!(\$y, eps(), \$x)
samples: 10000; evals/sample: 10; memory estimate: 0 bytes; allocs estimate: 0
ns

(1160.0 - 1240.0]  ██████████████████████████████ 9573
(1240.0 - 1320.0]  █306
(1320.0 - 1390.0]  ▎53
(1390.0 - 1470.0]  ▏25
(1470.0 - 1550.0]   0
(1550.0 - 1620.0]   0
(1620.0 - 1700.0]   0
(1700.0 - 1780.0]   0
(1780.0 - 1860.0]   0
(1860.0 - 1930.0]   0
(1930.0 - 2010.0]   0
(2010.0 - 2090.0]   0
(2090.0 - 2160.0]   0
(2160.0 - 2240.0]  ▏32
(2240.0 - 3230.0]  ▏11

Counts

min: 1.161 μs (0.00% GC); mean: 1.182 μs (0.00% GC); median: 1.169 μs (0.00% GC); max: 3.226 μs (0.00% GC).

julia> @benchmark axpy!(eps(), \$x, \$y)
samples: 10000; evals/sample: 9; memory estimate: 0 bytes; allocs estimate: 0
ns

(2030.0 - 2160.0]  ██████████████████████████████ 9415
(2160.0 - 2300.0]  █▋497
(2300.0 - 2430.0]  ▎49
(2430.0 - 2570.0]  ▏5
(2570.0 - 2700.0]   0
(2700.0 - 2840.0]  ▏1
(2840.0 - 2970.0]   0
(2970.0 - 3110.0]   0
(3110.0 - 3240.0]  ▏1
(3240.0 - 3370.0]   0
(3370.0 - 3510.0]   0
(3510.0 - 3640.0]  ▏1
(3640.0 - 3780.0]   0
(3780.0 - 3910.0]  ▏21
(3910.0 - 4880.0]  ▏10

Counts

min: 2.030 μs (0.00% GC); mean: 2.060 μs (0.00% GC); median: 2.039 μs (0.00% GC); max: 4.881 μs (0.00% GC).

julia> @benchmark axpy_atthread!(\$y, eps(), \$x)
samples: 10000; evals/sample: 7; memory estimate: 3.66 KiB; allocs estimate: 41
ns

(3700.0  - 4600.0  ]  ██████████████████████████████▏7393
(4600.0  - 5500.0  ]  ███▌852
(5500.0  - 6400.0  ]  ██████▍1556
(6400.0  - 7300.0  ]  ▊175
(7300.0  - 8200.0  ]  ▏7
(8200.0  - 9100.0  ]  ▏3
(9100.0  - 10000.0 ]   0
(10000.0 - 10900.0 ]   0
(10900.0 - 11800.0 ]  ▏1
(11800.0 - 12800.0 ]   0
(12800.0 - 13700.0 ]  ▏1
(13700.0 - 14600.0 ]   0
(14600.0 - 15500.0 ]   0
(15500.0 - 16400.0 ]  ▏1
(16400.0 - 880700.0]  ▏11

Counts

min: 3.662 μs (0.00% GC); mean: 4.909 μs (6.36% GC); median: 4.226 μs (0.00% GC); max: 880.721 μs (93.63% GC).

julia> @benchmark axpy_per_core!(\$y, eps(), \$x)
samples: 10000; evals/sample: 194; memory estimate: 0 bytes; allocs estimate: 0
ns

(496.0 - 504.0 ]  ██████████████████████████████ 5969
(504.0 - 513.0 ]  ██████████████████3564
(513.0 - 522.0 ]  ██▏420
(522.0 - 531.0 ]  ▏9
(531.0 - 539.0 ]  ▏4
(539.0 - 548.0 ]  ▏1
(548.0 - 557.0 ]  ▏7
(557.0 - 565.0 ]  ▏3
(565.0 - 574.0 ]  ▏2
(574.0 - 583.0 ]   0
(583.0 - 591.0 ]  ▏1
(591.0 - 600.0 ]  ▏4
(600.0 - 609.0 ]  ▏3
(609.0 - 617.0 ]  ▏2
(617.0 - 1181.0]  ▏11

Counts

min: 495.758 ns (0.00% GC); mean: 505.037 ns (0.00% GC); median: 503.884 ns (0.00% GC); max: 1.181 μs (0.00% GC).

julia> @benchmark axpy_per_thread!(\$y, eps(), \$x)
samples: 10000; evals/sample: 181; memory estimate: 0 bytes; allocs estimate: 0
ns

(583.0 - 611.0 ]  ██████████████████████████████ 8489
(611.0 - 640.0 ]  █████▎1453
(640.0 - 669.0 ]  ▏21
(669.0 - 697.0 ]  ▏12
(697.0 - 726.0 ]  ▏5
(726.0 - 755.0 ]  ▏2
(755.0 - 783.0 ]  ▏2
(783.0 - 812.0 ]  ▏1
(812.0 - 841.0 ]   0
(841.0 - 869.0 ]   0
(869.0 - 898.0 ]  ▏1
(898.0 - 927.0 ]   0
(927.0 - 955.0 ]   0
(955.0 - 984.0 ]  ▏3
(984.0 - 9088.0]  ▏11

Counts

min: 582.608 ns (0.00% GC); mean: 609.063 ns (0.00% GC); median: 606.028 ns (0.00% GC); max: 9.088 μs (0.00% GC).

julia> @benchmark axpy_minbatch!(\$y, eps(), \$x)
samples: 10000; evals/sample: 195; memory estimate: 0 bytes; allocs estimate: 0
ns

(484.0 - 514.0 ]  ██████████████████████████████9874
(514.0 - 544.0 ]  ▎43
(544.0 - 574.0 ]  ▏24
(574.0 - 604.0 ]  ▏18
(604.0 - 634.0 ]  ▏13
(634.0 - 664.0 ]  ▏2
(664.0 - 694.0 ]  ▏1
(694.0 - 724.0 ]  ▏1
(724.0 - 754.0 ]   0
(754.0 - 784.0 ]  ▏8
(784.0 - 814.0 ]   0
(814.0 - 844.0 ]   0
(844.0 - 874.0 ]  ▏2
(874.0 - 904.0 ]  ▏3
(904.0 - 3364.0]  ▏11

Counts

min: 484.082 ns (0.00% GC); mean: 502.104 ns (0.00% GC); median: 499.708 ns (0.00% GC); max: 3.364 μs (0.00% GC).

julia> versioninfo()
Julia Version 1.7.0-DEV.1150
Commit a08a3ff1f9* (2021-05-22 21:10 UTC)
Platform Info:
OS: Linux (x86_64-redhat-linux)
CPU: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.0 (ORCJIT, tigerlake)
Environment:

The `minbatch` argument lets us choose a minimum number of iterations per thread. That is, `minbatch=n` means it'll use at most `number_loop_iterations ÷ n` threads. Setting `minbatch=2000` like we did above means that with only 4000 iterations, `@batch` will use just 2 threads; with 3999 iterations, it'll only only 1. This lets us control the pace with which it ramps up threads. By using only 2 threads with 4000 iterations, it is still much faster than the serial version, while using 4 threads (`per=core`) it is only slightly faster, and the full 8 (`per=thread`) matches serial.

```julia> x = rand(4_000); y = rand(4_000);

julia> @benchmark axpy_serial!(\$y, eps(), \$x)
samples: 10000; evals/sample: 196; memory estimate: 0 bytes; allocs estimate: 0
ns

(477.0 - 484.0]  ██████1379
(484.0 - 491.0]  ██████████████████████████████ 6931
(491.0 - 499.0]  ████▍1004
(499.0 - 506.0]  ▍71
(506.0 - 513.0]  ▏28
(513.0 - 520.0]  ▎47
(520.0 - 528.0]  ▎45
(528.0 - 535.0]  ▏20
(535.0 - 542.0]  ██443
(542.0 - 549.0]  ▏15
(549.0 - 557.0]  ▏3
(557.0 - 564.0]   0
(564.0 - 571.0]   0
(571.0 - 578.0]  ▏3
(578.0 - 858.0]  ▏11

Counts

min: 476.867 ns (0.00% GC); mean: 490.402 ns (0.00% GC); median: 488.444 ns (0.00% GC); max: 858.056 ns (0.00% GC).

julia> @benchmark axpy_minbatch!(\$y, eps(), \$x)
samples: 10000; evals/sample: 276; memory estimate: 0 bytes; allocs estimate: 0
ns

(287.0 - 297.0]  ██████████████████▌2510
(297.0 - 306.0]  ██████████████████████████████ 4088
(306.0 - 316.0]  ███████████████████████▋3205
(316.0 - 325.0]  █▎158
(325.0 - 335.0]  ▎24
(335.0 - 344.0]   0
(344.0 - 354.0]  ▏1
(354.0 - 364.0]   0
(364.0 - 373.0]   0
(373.0 - 383.0]   0
(383.0 - 392.0]   0
(392.0 - 402.0]   0
(402.0 - 411.0]  ▏1
(411.0 - 421.0]  ▏2
(421.0 - 689.0]  ▏11

Counts

min: 286.938 ns (0.00% GC); mean: 302.339 ns (0.00% GC); median: 299.721 ns (0.00% GC); max: 689.467 ns (0.00% GC).

julia> @benchmark axpy_per_core!(\$y, eps(), \$x)
samples: 10000; evals/sample: 213; memory estimate: 0 bytes; allocs estimate: 0
ns

(344.0 - 351.0]  █▋325
(351.0 - 359.0]  ██████████████████████████████ 6026
(359.0 - 366.0]  ████████████████▏3229
(366.0 - 373.0]  █▋321
(373.0 - 381.0]  ▍55
(381.0 - 388.0]  ▏12
(388.0 - 396.0]  ▏7
(396.0 - 403.0]  ▏6
(403.0 - 410.0]  ▏1
(410.0 - 418.0]   0
(418.0 - 425.0]   0
(425.0 - 433.0]  ▏1
(433.0 - 440.0]  ▏5
(440.0 - 447.0]  ▏1
(447.0 - 795.0]  ▏11

Counts

min: 343.770 ns (0.00% GC); mean: 357.972 ns (0.00% GC); median: 357.270 ns (0.00% GC); max: 794.709 ns (0.00% GC).

julia> @benchmark axpy_per_thread!(\$y, eps(), \$x)
samples: 10000; evals/sample: 195; memory estimate: 0 bytes; allocs estimate: 0
ns

(476.0 - 487.0 ]  ██████████████████████████████▏7273
(487.0 - 499.0 ]  ██████████▉2625
(499.0 - 510.0 ]  ▎48
(510.0 - 522.0 ]  ▏15
(522.0 - 533.0 ]  ▏6
(533.0 - 545.0 ]  ▏2
(545.0 - 557.0 ]  ▏5
(557.0 - 568.0 ]  ▏5
(568.0 - 580.0 ]  ▏3
(580.0 - 591.0 ]  ▏2
(591.0 - 603.0 ]   0
(603.0 - 614.0 ]   0
(614.0 - 626.0 ]  ▏3
(626.0 - 638.0 ]  ▏2
(638.0 - 2489.0]  ▏11

Counts

min: 475.564 ns (0.00% GC); mean: 486.650 ns (0.00% GC); median: 485.287 ns (0.00% GC); max: 2.489 μs (0.00% GC).```

Seeing that we still see a substantial improvement with 2 threads for vectors of length 4000, we may thus expect to still see improvement for vectors of length 3000, and could thus try `minbatch=1_500`. That'd also ensure we're still using just 2 threads for vectos of length 4000. However, performance with respect to size tends to have a lot discontinuities.

```julia> function axpy_minbatch_1500!(y, a, x)
@batch minbatch=1_500 for i in eachindex(y,x)
y[i] = muladd(a, x[i], y[i])
end
end
axpy_minbatch_1500! (generic function with 1 method)

julia> x = rand(3_000); y = rand(3_000);

julia> @benchmark axpy_serial!(\$y, eps(), \$x)
samples: 10000; evals/sample: 839; memory estimate: 0 bytes; allocs estimate: 0
ns

(145.3 - 151.6]  ██████████████████████████████9289
(151.6 - 157.9]  ▌133
(157.9 - 164.3]  █▋484
(164.3 - 170.6]  ▏14
(170.6 - 176.9]   0
(176.9 - 183.3]  ▏2
(183.3 - 189.6]  ▏9
(189.6 - 195.9]  ▏6
(195.9 - 202.2]  ▏6
(202.2 - 208.6]  ▏5
(208.6 - 214.9]  ▏4
(214.9 - 221.2]  ▏9
(221.2 - 227.6]  ▏14
(227.6 - 233.9]  ▏14
(233.9 - 260.2]  ▏11

Counts

min: 145.273 ns (0.00% GC); mean: 148.314 ns (0.00% GC); median: 145.881 ns (0.00% GC); max: 260.150 ns (0.00% GC).

julia> @benchmark axpy_minbatch!(\$y, eps(), \$x)
samples: 10000; evals/sample: 807; memory estimate: 0 bytes; allocs estimate: 0
ns

(148.6 - 153.6]  ██████████████████████████████ 8937
(153.6 - 158.7]  ██▍674
(158.7 - 163.8]  █292
(163.8 - 168.9]  ▎71
(168.9 - 174.0]  ▏4
(174.0 - 179.0]  ▏3
(179.0 - 184.1]   0
(184.1 - 189.2]   0
(189.2 - 194.3]   0
(194.3 - 199.3]   0
(199.3 - 204.4]   0
(204.4 - 209.5]  ▏1
(209.5 - 214.6]   0
(214.6 - 219.6]  ▏7
(219.6 - 742.4]  ▏11

Counts

min: 148.572 ns (0.00% GC); mean: 152.167 ns (0.00% GC); median: 152.376 ns (0.00% GC); max: 742.447 ns (0.00% GC).

julia> @benchmark axpy_minbatch_1500!(\$y, eps(), \$x)
samples: 10000; evals/sample: 233; memory estimate: 0 bytes; allocs estimate: 0
ns

(317.7 - 323.9]  ▍43
(323.9 - 330.2]  ████▉591
(330.2 - 336.4]  ████████████████████▉2538
(336.4 - 342.6]  ██████████████████████████████3669
(342.6 - 348.8]  ██████████████████▉2299
(348.8 - 355.0]  █████▌667
(355.0 - 361.2]  █▏129
(361.2 - 367.4]  ▎21
(367.4 - 373.6]  ▏13
(373.6 - 379.8]  ▏5
(379.8 - 386.0]  ▏2
(386.0 - 392.2]  ▏2
(392.2 - 398.4]  ▏5
(398.4 - 404.6]  ▏5
(404.6 - 791.4]  ▏11

Counts

min: 317.738 ns (0.00% GC); mean: 339.868 ns (0.00% GC); median: 339.279 ns (0.00% GC); max: 791.361 ns (0.00% GC).```

By reducing the length of the vectors by just 1/3 (4000 -> 3000), we saw over a 3.5x speedup in the serial version. `minbatch=2000`, by also using only a single thread was able to match its performance. Thus, something around `minbatch=2000` seems like the best choice for this particular function on this particular CPU.

Note that `@batch` defaults to using up to one thread per physical core, instead of 1 thread per CPU thread. This is because LoopVectorization.jl currently only uses up to 1 thread per physical core, and switching the number of threads incurs some overhead. See the docstring on `@batch` (i.e., `?@batch` in a Julia REPL) for some more discussion.

## Local per-thread storage

You also can define local storage for each thread, providing a vector containing each of the local storages at the end.

```julia> let
@batch threadlocal=rand(10:99) for i in 0:9
end
end

index 8, thread 1, local storage 30
index 3, thread 3, local storage 49
index 9, thread 1, local storage 31
index 6, thread 4, local storage 33
index 0, thread 2, local storage 14
index 4, thread 3, local storage 50
index 7, thread 4, local storage 34
index 1, thread 2, local storage 15
index 5, thread 3, local storage 51
index 2, thread 2, local storage 16
Any[32, 17, 52, 35]```

Optionally, a type can be specified for the thread-local storage:

```julia> let
@batch threadlocal=rand(10:99)::Float16 for i in 0:9
end
end

Float16[83.0, 90.0, 27.0, 65.0]```

## Disabling Polyester threads

When running many repetitions of a Polyester-multithreaded function (e.g. in an embarrassingly parallel problem that repeatedly executes a small already Polyester-multithreaded function), it can be beneficial to disable Polyester (the inner multithreaded loop) and multithread only at the outer level (e.g. with `Base.Threads`). This can be done with the `disable_polyester_threads` context manager. In the expandable section below you can see examples with benchmarks.

It is best to call `disable_polyester_threads` only once, before any `@thread` uses happen, to avoid overhead. E.g. best to do it as:

```disable_polyester_threads() do
@threads for i in 1:n
f()
end
end```

instead of doing it in the following unnecessarily slow manner:

```@threads for i in 1:n # DO NOT DO THIS
disable_polyester_threads() do # IT HAS UNNECESSARY OVERHEAD
f()
end
end```
Benchmarks of nested multi-threading with Polyester
```# Big inner problem, repeated only a few times

y = rand(10000000,4);
x = rand(size(y)...);

@btime inner(\$x,\$y,1) # 73.319 ms (0 allocations: 0 bytes)
@btime inner_polyester(\$x,\$y,1) # 8.936 ms (0 allocations: 0 bytes)
@btime inner_thread(\$x,\$y,1) # 11.206 ms (49 allocations: 4.56 KiB)

@btime sequential_sequential(\$x,\$y) # 274.926 ms (0 allocations: 0 bytes)
@btime sequential_polyester(\$x,\$y) # 36.963 ms (0 allocations: 0 bytes)
@btime sequential_thread(\$x,\$y) # 49.373 ms (196 allocations: 18.25 KiB)

@btime threads_of_polyester(\$x,\$y) # 78.828 ms (58 allocations: 4.84 KiB)
# the following is a purposefully suboptimal way to disable threads
@btime threads_of_polyester_inner_disable(\$x,\$y) # 70.182 ms (47 allocations: 4.50 KiB)
# the following is a good way to disable threads (the disable call happening once in the outer scope)
@btime Polyester.disable_polyester_threads() do; threads_of_polyester(\$x,\$y) end; # 71.141 ms (47 allocations: 4.50 KiB)
@btime threads_of_sequential(\$x,\$y) # 70.857 ms (46 allocations: 4.47 KiB)
@btime threads_of_thread(\$x,\$y) # 45.116 ms (219 allocations: 22.00 KiB)

# Small inner problem, repeated many times

y = rand(1000,1000);
x = rand(size(y)...);

@btime inner(\$x,\$y,1) # 7.028 μs (0 allocations: 0 bytes)
@btime inner_polyester(\$x,\$y,1) # 1.917 μs (0 allocations: 0 bytes)
@btime inner_thread(\$x,\$y,1) # 7.544 μs (45 allocations: 4.44 KiB)

@btime sequential_sequential(\$x,\$y) # 6.790 ms (0 allocations: 0 bytes)
@btime sequential_polyester(\$x,\$y) # 2.070 ms (0 allocations: 0 bytes)
@btime sequential_thread(\$x,\$y) # 9.296 ms (49002 allocations: 4.46 MiB)

@btime threads_of_polyester(\$x,\$y) # 2.090 ms (42 allocations: 4.34 KiB)
# the following is a purposefully suboptimal way to disable threads
@btime threads_of_polyester_inner_disable(\$x,\$y) # 1.065 ms (42 allocations: 4.34 KiB)
# the following is a good way to disable threads (the disable call happening once in the outer scope)
@btime Polyester.disable_polyester_threads() do; threads_of_polyester(\$x,\$y) end; # 997.918 μs (49 allocations: 4.56 KiB)
@btime threads_of_sequential(\$x,\$y) # 1.057 ms (48 allocations: 4.53 KiB)
@btime threads_of_thread(\$x,\$y) # 4.105 ms (42059 allocations: 4.25 MiB)

# The tested functions
# All of these would be better implemented by just using @tturbo,
# but these suboptimal implementations serve as good test case for
# Polyster-vs-Base thread scheduling.

function inner(x,y,j)
for i ∈ axes(x,1)
y[i,j] = sin(x[i,j])
end
end

function inner_polyester(x,y,j)
@batch for i ∈ axes(x,1)
y[i,j] = sin(x[i,j])
end
end

@threads for i ∈ axes(x,1)
y[i,j] = sin(x[i,j])
end
end

function sequential_sequential(x,y)
for j ∈ axes(x,2)
inner(x,y,j)
end
end

function sequential_polyester(x,y)
for j ∈ axes(x,2)
inner_polyester(x,y,j)
end
end

for j ∈ axes(x,2)
end
end

@threads for j ∈ axes(x,2)
inner_polyester(x,y,j)
end
end

# XXX This is a bad way to disable Polyester threads as
# See the benchmarks above for a better way.
@threads for j ∈ axes(x,2)
inner_polyester(x,y,j)
end
end
end

@threads for j ∈ axes(x,2)
end
end

@threads for j ∈ axes(x,2)
end
end

@threads for j ∈ axes(x,2)
inner(x,y,j)
end
end```

Benchmarks executed on:

``````Julia Version 1.9.0-DEV.998
Commit e1739aa42a1 (2022-07-18 10:27 UTC)
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 16 × AMD Ryzen 7 1700 Eight-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-14.0.5 (ORCJIT, znver1)
Threads: 8 on 16 virtual cores
Environment:
JULIA_EDITOR = code