Skip to content

Commit

Permalink
WIP: use LoopVectorization
Browse files Browse the repository at this point in the history
Preliminary tests suggest ~4x speedups, not too shabby
  • Loading branch information
timholy committed Sep 23, 2021
1 parent 6b51789 commit 22e7486
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 64 deletions.
6 changes: 5 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ author = ["Tim Holy <[email protected]>", "Jan Weidner <[email protected]>"]
version = "0.7.0"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
CatIndices = "aafaddc9-749c-510e-ac4f-586e18779b91"
ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
FFTViews = "4f61f5a4-77b1-5117-aa51-3ab5ef4ef0cd"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Expand All @@ -19,16 +21,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
TiledIteration = "06e1c1a7-607b-532d-9fad-de7d9aa2abac"

[compat]
ArrayInterface = "3"
CatIndices = "0.2"
ComputationalResources = "0.3"
DataStructures = "0.17.7, 0.18"
FFTViews = "0.3"
FFTW = "0.3, 1"
ImageCore = "0.9"
LoopVectorization = "0.12.75"
OffsetArrays = "1.9"
Reexport = "1.1"
StaticArrays = "0.10, 0.11, 0.12, 1.0"
TiledIteration = "0.2, 0.3"
TiledIteration = "0.4"
julia = "1"

[extras]
Expand Down
2 changes: 2 additions & 0 deletions src/ImageFiltering.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ using Base: Indices, tail, fill_to_length, @pure, depwarn, @propagate_inbounds
using OffsetArrays: IdentityUnitRange # using the one in OffsetArrays makes this work with multiple Julia versions
using SparseArrays # only needed to fix an ambiguity in borderarray
using Reexport
using LoopVectorization
using LoopVectorization.VectorizationBase

@reexport using OffsetArrays: centered # this method once lived here

Expand Down
171 changes: 108 additions & 63 deletions src/imfilter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,10 @@ end
function imfilter!(r::AbstractCPU{FIRTiled{N}}, out::AbstractArray{S,N}, A::AbstractArray{T,N}, kernel::Tuple{Any,Any,Vararg{Any}}, border::NoPad, inds::Indices=axes(out)) where {S,T,N}
kern = kernel[1]
iscopy(kern) && return imfilter!(r, out, A, tail(kernel), border, inds)
tmp = tile_allocate(filter_type(A, kernel), r.settings.tilesize, kernel)
TTile, f = native_eltype(filter_type(A, kernel))
@assert f == 1
# @show r.settings.tilesize
tmp = tile_allocate(TTile, r.settings.tilesize, kernel)
_imfilter_tiled!(r, out, A, kernel, border, tmp, inds)
out
end
Expand Down Expand Up @@ -834,6 +837,7 @@ end

# Single-threaded, pair of kernels (with only one temporary buffer required)
function _imfilter_tiled!(r::CPU1, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray
out, A, kernel = maybe_reinterpret(out, A, kernel)
k1, k2 = kernel
tile = tiles[1]
indsk2, indstile = axes(k2), axes(tile)
Expand All @@ -850,6 +854,7 @@ end

# Multithreaded, pair of kernels
function _imfilter_tiled!(r::CPUThreads, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray
out, A, kernel = maybe_reinterpret(out, A, kernel)
k1, k2 = kernel
tile = tiles[1]
indsk2, indstile = axes(k2), axes(tile)
Expand Down Expand Up @@ -908,10 +913,8 @@ end
out
end

# The first of the pair in `tmp` has the current data. We also make
# the second a plain array so there's no doubt about who's holding the
# proper indices.
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,Array})
# The first of the pair in `tmp` has the current data.
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,AbstractArray})
tileb1, tile2 = tmp
k1, kt = kernel[1], tail(kernel)
parentinds = axes(tileb1)
Expand All @@ -922,7 +925,7 @@ function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, borde
end

# on the last call we write to `out` instead of one of the buffers
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,Array})
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,AbstractArray})
tileb1 = tmp[1]
k1 = kernel[1]
parentinds = axes(tileb1)
Expand Down Expand Up @@ -1014,26 +1017,26 @@ end
# This is unfortunate, but specializing this saves an add in the inner
# loop and results in a modest performance improvement. It would be
# nice if LLVM did this automatically. (@polly?)
function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
off, k = CartesianIndex(kern.offsets), parent(kern)
o, O = safehead(off), safetail(off)
Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
Rk = CartesianIndices(axes(k))
offA, pA = CartesianIndex(A.offsets), parent(A)
oA, OA = safehead(offA), safetail(offA)
for I in safetail(Rnew)
IA = I-OA
for i in safehead(Rnew)
tmp = z
iA = i-oA
@inbounds for J in safetail(Rk), j in safehead(Rk)
tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
end
@inbounds out[i-o,I-O] = tmp
end
end
out
end
# function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
# off, k = CartesianIndex(kern.offsets), parent(kern)
# o, O = safehead(off), safetail(off)
# Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
# Rk = CartesianIndices(axes(k))
# offA, pA = CartesianIndex(A.offsets), parent(A)
# oA, OA = safehead(offA), safetail(offA)
# for I in safetail(Rnew)
# IA = I-OA
# for i in safehead(Rnew)
# tmp = z
# iA = i-oA
# @inbounds for J in safetail(Rk), j in safehead(Rk)
# tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
# end
# @inbounds out[i-o,I-O] = tmp
# end
# end
# out
# end

function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::ReshapedOneD, border::NoPad, inds)
Rpre, ind, Rpost = iterdims(inds, kern)
Expand All @@ -1043,68 +1046,110 @@ function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::R
return out
end
p = accumfilter(A[first(R)+first(Rk)], first(k))
z = zero(typeof(p+p))
z = float(zero(eltype(A)))
# z = zero(typeof(p+p))
_imfilter_inbounds!(r, z, out, A, k, Rpre, ind, Rpost)
end

# Many of the following are unfortunate specializations
function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
_imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
end
# # Many of the following are unfortunate specializations
# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
# _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
# end

function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, koffset=0)
# LoopVectorization.check_type(::Type{T}) where T<:ColorVectorSpace.MathTypes = true
# @generated function VectorizationBase.zero_vecunroll(::StaticInt{N}, ::StaticInt{W}, ::Type{Gray{T}}, ::StaticInt{RS}) where {N,W,T,RS}
# Expr(:block, Expr(:meta, :inline), :(VectorizationBase._vzero(VecUnroll{$(N-1),$W,$T,Vec{$W,$T}}, StaticInt{$RS}())))
# end
# function VectorizationBase._vload_unroll(
# sptr::AbstractStridedPointer{Gray{T},N,C,B}, u::Unroll{AU,F,UN,AV,W,M,UX,I}, ::A, ::StaticInt{RS}, ::StaticInt{X}
# ) where {T<:NativeTypes,N,C,B,AU,F,UN,AV,W,M,UX,I<:IndexNoUnroll,A<:StaticBool,RS,X}
# VectorizationBase.VecUnroll{N,1,T,T}(x::S) where {N,T<:VectorizationBase.NativeTypes,S<:FixedPoint{T}} =
# VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
# VectorizationBase.VecUnroll(x::FixedPoint) = VectorizationBase.VecUnroll(reinterpret(x))
# VectorizationBase.VecUnroll(x::AbstractGray) = VectorizationBase.VecUnroll(gray(x))
# VectorizationBase.VecUnroll(x::Gray) where {N,T<:VectorizationBase.NativeTypes} =
# VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))

const LVTypes = Union{VectorizationBase.NativeTypes, SVector{N,<:VectorizationBase.NativeTypes} where N}

const args = Ref{Any}()
function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray{<:LVTypes}, A::AbstractArray{<:LVTypes}, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
if !LoopVectorization.check_args(out, A)
@show summary(out) summary(A)
args[] = (deepcopy(out), deepcopy(A))
error("this should have worked")
end
indsk = axes(k, 1)
zout = convert(eltype(out), z)
for Ipost in Rpost
for i in ind
ik = i+koffset
for Ipre in Rpre
tmp = z
@turbo for Ipre in Rpre
tmp = zout
for j in indsk
@inbounds tmp += safe_for_prod(A[Ipre,ik+j,Ipost], tmp)*k[j]
tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j]
end
@inbounds out[Ipre,i,Ipost] = tmp
out[Ipre,i,Ipost] = tmp
end
end
end
out
end

function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
Rpre, ind, Rpost = iterdims(inds, kern)
k = kern.data
R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
if isempty(R) || isempty(Rk)
return out
end
p = accumfilter(A[first(R)+first(Rk)], first(k))
z = zero(typeof(p+p))
Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
_imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
end

function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
_imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
end

function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost, koffset=0)
# No @turbo version
function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
indsk = axes(k, 1)
zout = convert(eltype(out), z)
for Ipost in Rpost
IOpost = Ipost - Opost
for i in ind
io = i-o+koffset
for Ipre in Rpre
IOpre = Ipre - Opre
tmp = z
@inbounds for Ipre in Rpre
tmp = zout
for j in indsk
@inbounds tmp += safe_for_prod(A[IOpre,io+j,IOpost], tmp)*k[j]
tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j]
end
@inbounds out[Ipre,i,Ipost] = tmp
out[Ipre,i,Ipost] = tmp
end
end
end
out
end
# end unfortunate specializations

# function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
# Rpre, ind, Rpost = iterdims(inds, kern)
# k = kern.data
# R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
# if isempty(R) || isempty(Rk)
# return out
# end
# p = accumfilter(A[first(R)+first(Rk)], first(k))
# z = zero(typeof(p+p))
# Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
# _imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
# end

# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
# _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
# end

# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
# indsk = axes(k, 1)
# zout = convert(eltype(out), z)
# for Ipost in Rpost
# IOpost = Ipost - Opost
# for i in ind
# io = i-o+koffset
# @turbo for Ipre in Rpre
# IOpre = Ipre - Opre
# tmp = zout
# for j in indsk
# tmp += safe_for_prod(A[IOpre,io+j,IOpost], z)*k[j]
# end
# @inbounds out[Ipre,i,Ipost] = tmp
# end
# end
# end
# out
# end
# # end unfortunate specializations

## commented out because "virtual padding" is commented out
# function _imfilter_iter!(r::AbstractResource, out, padded, kernel::AbstractArray, iter)
Expand Down
3 changes: 3 additions & 0 deletions src/kernelfactors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ for op in (:+, :-, :*, :/)
end
Base.BroadcastStyle(::Type{R}) where {R<:ReshapedOneD{T,N}} where {T,N} = Broadcast.DefaultArrayStyle{N}()

Base.:/(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data / x)
Base.:*(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data * x)

_reshape(A::ReshapedOneD{T,N}, ::Val{N}) where {T,N} = A
_vec(A::ReshapedOneD) = A.data
Base.vec(A::ReshapedOneD) = A.data # is this OK? (note indices won't nec. start with 1)
Expand Down
24 changes: 24 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,27 @@ accumfilter(pixelval::Colorant{N0f8}, filterval::N0f8) = float32(c)*Float32(filt
# In theory, the following might need to be specialized. For safety, make it a
# standalone function call.
safe_for_prod(x, ref) = oftype(ref, x)

# For LoopVectorization, the easiest path is to convert to native types
native_eltype(::Type{T}) where T<:VectorizationBase.NativeTypes = T, 1
native_eltype(::Type{T}) where T<:FixedPoint = FixedPointNumbers.rawtype(T), FixedPointNumbers.rawone(T)
native_eltype(::Type{Gray{T}}) where T = native_eltype(T)
function native_eltype(::Type{RGB{T}}) where T
T′, f = native_eltype(T)
return SVector{3,T′}, f
end
native_eltype(::Type{T}) where T = T, 1

function maybe_reinterpret(out::AbstractArray, A::AbstractArray, kernel::Tuple{Any,Vararg{Any}})
Tout = eltype(out)
Tout′, outf = native_eltype(Tout)
TA = eltype(A)
TA′, Af = native_eltype(TA)
if Tout′ !== Tout || TA′ !== TA
kernel′ = ((kernel[1]/Af)*outf, Base.tail(kernel)...)
return (Tout === Tout′ ? out : reinterpret(reshape, Tout′, out),
TA === TA′ ? A : reinterpret(reshape, TA′, A),
kernel′)
end
return out, A, kernel
end

0 comments on commit 22e7486

Please sign in to comment.