``````using BenchmarkTools

versioninfo(verbose=true)
``````

### HWloc

``````using Pkg
Pkg.add("Hwloc")

using Hwloc
Hwloc.num_physical_cores()
``````

###Starting Threads

``````using Base.Threads
nthreads()

threadid()

a=zeros(4)
for i in 1:nthreads()
a[threadid()] = threadid()
end
a
``````

### The @Threads macro

``````    @threads for i in 1:nthreads()
a[threadid()] = threadid()
end
``````

#### Prefix sum

``````function prefix_threads!(y)
l=length(y)
k=ceil(Int, log2(l))
for j=1:k
@threads for i=2^j:2^j:min(l, 2^k)
@inbounds y[i] = y[i-2^(j-1)] + y[i]
end
end
for j=(k-1):-1:1
@threads for i=3*2^(j-1):2^j:min(l, 2^k)
@inbounds y[i] = y[i-2^(j-1)] + y[i]
end
end
y
end
``````

## Thread Safety

### Monte Carlo

``````using Random
function darts_in_circle(n, rng=Random.GLOBAL_RNG)
inside = 0
for i in 1:n
if rand(rng)^2 + rand(rng)^2 < 1
inside += 1
end
end
return inside
end

function pi_serial(n)
return 4 * darts_in_circle(n) / n
end

using Base.Threads

const rnglist = [MersenneTwister() for i in 1:nthreads()];

function pi_threads(n, loops)
inside = zeros(Int, loops)
@threads for i in 1:loops
rng = rnglist[threadid()]
inside[threadid()] = darts_in_circle(n, rng)
end
return 4 * sum(inside) / (n*loops)
end

pi_threads(2_500_000, 4)

@btime pi_serial(10_000_000)

@btime pi_threads(2_500_000, 4)
``````

### Atomics

`````` function sum_thread_base(x)
r = zero(eltype(x))
@threads for i in eachindex(x)
@inbounds r += x[i]
end
return r
end

a=rand(10_000_000);

@btime sum(\$a)

@btime sum_thread_base(\$a)

function sum_thread_atomic(x)
r = Atomic{eltype(x)}(zero(eltype(x)))
@threads for i in eachindex(x)
@inbounds atomic_add!(r, x[i])
end
return r[]
end

@btime sum_thread_atomic(\$a)

function sum_thread_split(A)
r = Atomic{eltype(A)}(zero(eltype(A)))
len, rem = divrem(length(A), nthreads())
#Split the array equally among the threads
@threads for t in 1:nthreads()
r[] = zero(eltype(A))
@simd for i in (1:len) .+ (t-1)*len
@inbounds r[] += A[i]
end
atomic_add!(r, r[])
end
result = r[]
#process up the remaining data
@simd for i in length(A)-rem+1:length(A)
@inbounds result += A[i]
end
return result
end

@btime sum_thread_split(\$a)

@btime sum_thread_split(\$a)
``````

### Synchronisation primitives

``````const f = open(tempname(), "a+")

const mt = Base.Threads.Mutex();

@threads for i in 1:50
r = pi_serial(10_000_000)
lock(mt)
write(f, "From \$(threadid()), pi = \$r\n")
unlock(mt)
end

close(f)

const s = Base.Semaphore(2);
@threads for i in 1:nthreads()
Base.acquire(s)
r = pi_serial(10_000_000)
Core.println("Calculated pi = \$r in Thread \$(threadid())")
Base.release(s)
end
``````

## Threaded libraries

``````a = rand(1000, 1000);

b = rand(1000, 1000);

@btime \$a*\$b;
``````

### Oversubscriptions

``````function matmul_serial(x)
first_num = zeros(length(x))
for i in eachindex(x)
@inbounds first_num[i] = (x[i]'*x[i])[1]
end
return first_num
end

function matmul_thread(x)
first_num = zeros(length(x))
@threads for i in eachindex(x)
@inbounds first_num[i] = (x[i]'*x[i])[1]
end
return first_num
end

m = [rand(1000, 1000) for _ in 1:10];

@btime matmul_serial(m);

@btime matmul_thread(m);

using LinearAlgebra
BLAS.set_num_threads(1)

@btime matmul_thread(m);
``````