From 22e74861188327a28572a1c197cdc866c7ef3cef Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Wed, 22 Sep 2021 17:01:18 -0500 Subject: [PATCH] WIP: use LoopVectorization Preliminary tests suggest ~4x speedups, not too shabby --- Project.toml | 6 +- src/ImageFiltering.jl | 2 + src/imfilter.jl | 171 ++++++++++++++++++++++++++---------------- src/kernelfactors.jl | 3 + src/utils.jl | 24 ++++++ 5 files changed, 142 insertions(+), 64 deletions(-) diff --git a/Project.toml b/Project.toml index 99ffc84..3645b42 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ author = ["Tim Holy ", "Jan Weidner "] version = "0.7.0" [deps] +ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" CatIndices = "aafaddc9-749c-510e-ac4f-586e18779b91" ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" @@ -11,6 +12,7 @@ FFTViews = "4f61f5a4-77b1-5117-aa51-3ab5ef4ef0cd" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -19,16 +21,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" TiledIteration = "06e1c1a7-607b-532d-9fad-de7d9aa2abac" [compat] +ArrayInterface = "3" CatIndices = "0.2" ComputationalResources = "0.3" DataStructures = "0.17.7, 0.18" FFTViews = "0.3" FFTW = "0.3, 1" ImageCore = "0.9" +LoopVectorization = "0.12.75" OffsetArrays = "1.9" Reexport = "1.1" StaticArrays = "0.10, 0.11, 0.12, 1.0" -TiledIteration = "0.2, 0.3" +TiledIteration = "0.4" julia = "1" [extras] diff --git a/src/ImageFiltering.jl b/src/ImageFiltering.jl index 58df01f..1b46868 100644 --- a/src/ImageFiltering.jl +++ b/src/ImageFiltering.jl @@ -10,6 +10,8 @@ using Base: Indices, tail, fill_to_length, @pure, depwarn, @propagate_inbounds using OffsetArrays: IdentityUnitRange # using the one in OffsetArrays makes this work with multiple Julia versions using SparseArrays # only needed to fix an ambiguity in borderarray using Reexport +using LoopVectorization +using LoopVectorization.VectorizationBase @reexport using OffsetArrays: centered # this method once lived here diff --git a/src/imfilter.jl b/src/imfilter.jl index 7faec92..b0be2db 100644 --- a/src/imfilter.jl +++ b/src/imfilter.jl @@ -774,7 +774,10 @@ end function imfilter!(r::AbstractCPU{FIRTiled{N}}, out::AbstractArray{S,N}, A::AbstractArray{T,N}, kernel::Tuple{Any,Any,Vararg{Any}}, border::NoPad, inds::Indices=axes(out)) where {S,T,N} kern = kernel[1] iscopy(kern) && return imfilter!(r, out, A, tail(kernel), border, inds) - tmp = tile_allocate(filter_type(A, kernel), r.settings.tilesize, kernel) + TTile, f = native_eltype(filter_type(A, kernel)) + @assert f == 1 + # @show r.settings.tilesize + tmp = tile_allocate(TTile, r.settings.tilesize, kernel) _imfilter_tiled!(r, out, A, kernel, border, tmp, inds) out end @@ -834,6 +837,7 @@ end # Single-threaded, pair of kernels (with only one temporary buffer required) function _imfilter_tiled!(r::CPU1, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray + out, A, kernel = maybe_reinterpret(out, A, kernel) k1, k2 = kernel tile = tiles[1] indsk2, indstile = axes(k2), axes(tile) @@ -850,6 +854,7 @@ end # Multithreaded, pair of kernels function _imfilter_tiled!(r::CPUThreads, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray + out, A, kernel = maybe_reinterpret(out, A, kernel) k1, k2 = kernel tile = tiles[1] indsk2, indstile = axes(k2), axes(tile) @@ -908,10 +913,8 @@ end out end -# The first of the pair in `tmp` has the current data. We also make -# the second a plain array so there's no doubt about who's holding the -# proper indices. -function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,Array}) +# The first of the pair in `tmp` has the current data. +function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,AbstractArray}) tileb1, tile2 = tmp k1, kt = kernel[1], tail(kernel) parentinds = axes(tileb1) @@ -922,7 +925,7 @@ function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, borde end # on the last call we write to `out` instead of one of the buffers -function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,Array}) +function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,AbstractArray}) tileb1 = tmp[1] k1 = kernel[1] parentinds = axes(tileb1) @@ -1014,26 +1017,26 @@ end # This is unfortunate, but specializing this saves an add in the inner # loop and results in a modest performance improvement. It would be # nice if LLVM did this automatically. (@polly?) -function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z) - off, k = CartesianIndex(kern.offsets), parent(kern) - o, O = safehead(off), safetail(off) - Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off))) - Rk = CartesianIndices(axes(k)) - offA, pA = CartesianIndex(A.offsets), parent(A) - oA, OA = safehead(offA), safetail(offA) - for I in safetail(Rnew) - IA = I-OA - for i in safehead(Rnew) - tmp = z - iA = i-oA - @inbounds for J in safetail(Rk), j in safehead(Rk) - tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J] - end - @inbounds out[i-o,I-O] = tmp - end - end - out -end +# function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z) +# off, k = CartesianIndex(kern.offsets), parent(kern) +# o, O = safehead(off), safetail(off) +# Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off))) +# Rk = CartesianIndices(axes(k)) +# offA, pA = CartesianIndex(A.offsets), parent(A) +# oA, OA = safehead(offA), safetail(offA) +# for I in safetail(Rnew) +# IA = I-OA +# for i in safehead(Rnew) +# tmp = z +# iA = i-oA +# @inbounds for J in safetail(Rk), j in safehead(Rk) +# tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J] +# end +# @inbounds out[i-o,I-O] = tmp +# end +# end +# out +# end function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::ReshapedOneD, border::NoPad, inds) Rpre, ind, Rpost = iterdims(inds, kern) @@ -1043,68 +1046,110 @@ function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::R return out end p = accumfilter(A[first(R)+first(Rk)], first(k)) - z = zero(typeof(p+p)) + z = float(zero(eltype(A))) + # z = zero(typeof(p+p)) _imfilter_inbounds!(r, z, out, A, k, Rpre, ind, Rpost) end -# Many of the following are unfortunate specializations -function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices) - _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1]) -end +# # Many of the following are unfortunate specializations +# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices) +# _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1]) +# end -function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, koffset=0) +# LoopVectorization.check_type(::Type{T}) where T<:ColorVectorSpace.MathTypes = true +# @generated function VectorizationBase.zero_vecunroll(::StaticInt{N}, ::StaticInt{W}, ::Type{Gray{T}}, ::StaticInt{RS}) where {N,W,T,RS} +# Expr(:block, Expr(:meta, :inline), :(VectorizationBase._vzero(VecUnroll{$(N-1),$W,$T,Vec{$W,$T}}, StaticInt{$RS}()))) +# end +# function VectorizationBase._vload_unroll( +# sptr::AbstractStridedPointer{Gray{T},N,C,B}, u::Unroll{AU,F,UN,AV,W,M,UX,I}, ::A, ::StaticInt{RS}, ::StaticInt{X} +# ) where {T<:NativeTypes,N,C,B,AU,F,UN,AV,W,M,UX,I<:IndexNoUnroll,A<:StaticBool,RS,X} +# VectorizationBase.VecUnroll{N,1,T,T}(x::S) where {N,T<:VectorizationBase.NativeTypes,S<:FixedPoint{T}} = + # VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x)) +# VectorizationBase.VecUnroll(x::FixedPoint) = VectorizationBase.VecUnroll(reinterpret(x)) +# VectorizationBase.VecUnroll(x::AbstractGray) = VectorizationBase.VecUnroll(gray(x)) +# VectorizationBase.VecUnroll(x::Gray) where {N,T<:VectorizationBase.NativeTypes} = +# VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x)) + +const LVTypes = Union{VectorizationBase.NativeTypes, SVector{N,<:VectorizationBase.NativeTypes} where N} + +const args = Ref{Any}() +function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray{<:LVTypes}, A::AbstractArray{<:LVTypes}, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices) + if !LoopVectorization.check_args(out, A) + @show summary(out) summary(A) + args[] = (deepcopy(out), deepcopy(A)) + error("this should have worked") + end indsk = axes(k, 1) + zout = convert(eltype(out), z) for Ipost in Rpost for i in ind - ik = i+koffset - for Ipre in Rpre - tmp = z + @turbo for Ipre in Rpre + tmp = zout for j in indsk - @inbounds tmp += safe_for_prod(A[Ipre,ik+j,Ipost], tmp)*k[j] + tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j] end - @inbounds out[Ipre,i,Ipost] = tmp + out[Ipre,i,Ipost] = tmp end end end out end -function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds) - Rpre, ind, Rpost = iterdims(inds, kern) - k = kern.data - R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern)) - if isempty(R) || isempty(Rk) - return out - end - p = accumfilter(A[first(R)+first(Rk)], first(k)) - z = zero(typeof(p+p)) - Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern) - _imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost) -end - -function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost) - _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1]) -end - -function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost, koffset=0) +# No @turbo version +function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices) indsk = axes(k, 1) + zout = convert(eltype(out), z) for Ipost in Rpost - IOpost = Ipost - Opost for i in ind - io = i-o+koffset - for Ipre in Rpre - IOpre = Ipre - Opre - tmp = z + @inbounds for Ipre in Rpre + tmp = zout for j in indsk - @inbounds tmp += safe_for_prod(A[IOpre,io+j,IOpost], tmp)*k[j] + tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j] end - @inbounds out[Ipre,i,Ipost] = tmp + out[Ipre,i,Ipost] = tmp end end end out end -# end unfortunate specializations + +# function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds) +# Rpre, ind, Rpost = iterdims(inds, kern) +# k = kern.data +# R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern)) +# if isempty(R) || isempty(Rk) +# return out +# end +# p = accumfilter(A[first(R)+first(Rk)], first(k)) +# z = zero(typeof(p+p)) +# Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern) +# _imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost) +# end + +# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost) +# _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1]) +# end + +# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost) +# indsk = axes(k, 1) +# zout = convert(eltype(out), z) +# for Ipost in Rpost +# IOpost = Ipost - Opost +# for i in ind +# io = i-o+koffset +# @turbo for Ipre in Rpre +# IOpre = Ipre - Opre +# tmp = zout +# for j in indsk +# tmp += safe_for_prod(A[IOpre,io+j,IOpost], z)*k[j] +# end +# @inbounds out[Ipre,i,Ipost] = tmp +# end +# end +# end +# out +# end +# # end unfortunate specializations ## commented out because "virtual padding" is commented out # function _imfilter_iter!(r::AbstractResource, out, padded, kernel::AbstractArray, iter) diff --git a/src/kernelfactors.jl b/src/kernelfactors.jl index 8e8f6e3..39b8fc1 100644 --- a/src/kernelfactors.jl +++ b/src/kernelfactors.jl @@ -111,6 +111,9 @@ for op in (:+, :-, :*, :/) end Base.BroadcastStyle(::Type{R}) where {R<:ReshapedOneD{T,N}} where {T,N} = Broadcast.DefaultArrayStyle{N}() +Base.:/(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data / x) +Base.:*(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data * x) + _reshape(A::ReshapedOneD{T,N}, ::Val{N}) where {T,N} = A _vec(A::ReshapedOneD) = A.data Base.vec(A::ReshapedOneD) = A.data # is this OK? (note indices won't nec. start with 1) diff --git a/src/utils.jl b/src/utils.jl index 3f74fa1..b00c712 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -131,3 +131,27 @@ accumfilter(pixelval::Colorant{N0f8}, filterval::N0f8) = float32(c)*Float32(filt # In theory, the following might need to be specialized. For safety, make it a # standalone function call. safe_for_prod(x, ref) = oftype(ref, x) + +# For LoopVectorization, the easiest path is to convert to native types +native_eltype(::Type{T}) where T<:VectorizationBase.NativeTypes = T, 1 +native_eltype(::Type{T}) where T<:FixedPoint = FixedPointNumbers.rawtype(T), FixedPointNumbers.rawone(T) +native_eltype(::Type{Gray{T}}) where T = native_eltype(T) +function native_eltype(::Type{RGB{T}}) where T + T′, f = native_eltype(T) + return SVector{3,T′}, f +end +native_eltype(::Type{T}) where T = T, 1 + +function maybe_reinterpret(out::AbstractArray, A::AbstractArray, kernel::Tuple{Any,Vararg{Any}}) + Tout = eltype(out) + Tout′, outf = native_eltype(Tout) + TA = eltype(A) + TA′, Af = native_eltype(TA) + if Tout′ !== Tout || TA′ !== TA + kernel′ = ((kernel[1]/Af)*outf, Base.tail(kernel)...) + return (Tout === Tout′ ? out : reinterpret(reshape, Tout′, out), + TA === TA′ ? A : reinterpret(reshape, TA′, A), + kernel′) + end + return out, A, kernel +end