Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: use LoopVectorization #229

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ author = ["Tim Holy <[email protected]>", "Jan Weidner <[email protected]>"]
version = "0.7.0"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
CatIndices = "aafaddc9-749c-510e-ac4f-586e18779b91"
ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
FFTViews = "4f61f5a4-77b1-5117-aa51-3ab5ef4ef0cd"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Expand All @@ -19,16 +21,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
TiledIteration = "06e1c1a7-607b-532d-9fad-de7d9aa2abac"

[compat]
ArrayInterface = "3"
CatIndices = "0.2"
ComputationalResources = "0.3"
DataStructures = "0.17.7, 0.18"
FFTViews = "0.3"
FFTW = "0.3, 1"
ImageCore = "0.9"
LoopVectorization = "0.12.75"
OffsetArrays = "1.9"
Reexport = "1.1"
StaticArrays = "0.10, 0.11, 0.12, 1.0"
TiledIteration = "0.2, 0.3"
TiledIteration = "0.4"
julia = "1"

[extras]
Expand Down
2 changes: 2 additions & 0 deletions src/ImageFiltering.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ using Base: Indices, tail, fill_to_length, @pure, depwarn, @propagate_inbounds
using OffsetArrays: IdentityUnitRange # using the one in OffsetArrays makes this work with multiple Julia versions
using SparseArrays # only needed to fix an ambiguity in borderarray
using Reexport
using LoopVectorization
using LoopVectorization.VectorizationBase

@reexport using OffsetArrays: centered # this method once lived here

Expand Down
171 changes: 108 additions & 63 deletions src/imfilter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,10 @@ end
function imfilter!(r::AbstractCPU{FIRTiled{N}}, out::AbstractArray{S,N}, A::AbstractArray{T,N}, kernel::Tuple{Any,Any,Vararg{Any}}, border::NoPad, inds::Indices=axes(out)) where {S,T,N}
kern = kernel[1]
iscopy(kern) && return imfilter!(r, out, A, tail(kernel), border, inds)
tmp = tile_allocate(filter_type(A, kernel), r.settings.tilesize, kernel)
TTile, f = native_eltype(filter_type(A, kernel))
@assert f == 1
# @show r.settings.tilesize
tmp = tile_allocate(TTile, r.settings.tilesize, kernel)
_imfilter_tiled!(r, out, A, kernel, border, tmp, inds)
out
end
Expand Down Expand Up @@ -834,6 +837,7 @@ end

# Single-threaded, pair of kernels (with only one temporary buffer required)
function _imfilter_tiled!(r::CPU1, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray
out, A, kernel = maybe_reinterpret(out, A, kernel)
k1, k2 = kernel
tile = tiles[1]
indsk2, indstile = axes(k2), axes(tile)
Expand All @@ -850,6 +854,7 @@ end

# Multithreaded, pair of kernels
function _imfilter_tiled!(r::CPUThreads, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray
out, A, kernel = maybe_reinterpret(out, A, kernel)
k1, k2 = kernel
tile = tiles[1]
indsk2, indstile = axes(k2), axes(tile)
Expand Down Expand Up @@ -908,10 +913,8 @@ end
out
end

# The first of the pair in `tmp` has the current data. We also make
# the second a plain array so there's no doubt about who's holding the
# proper indices.
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,Array})
# The first of the pair in `tmp` has the current data.
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,AbstractArray})
tileb1, tile2 = tmp
k1, kt = kernel[1], tail(kernel)
parentinds = axes(tileb1)
Expand All @@ -922,7 +925,7 @@ function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, borde
end

# on the last call we write to `out` instead of one of the buffers
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,Array})
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,AbstractArray})
tileb1 = tmp[1]
k1 = kernel[1]
parentinds = axes(tileb1)
Expand Down Expand Up @@ -1014,26 +1017,26 @@ end
# This is unfortunate, but specializing this saves an add in the inner
# loop and results in a modest performance improvement. It would be
# nice if LLVM did this automatically. (@polly?)
function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
off, k = CartesianIndex(kern.offsets), parent(kern)
o, O = safehead(off), safetail(off)
Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
Rk = CartesianIndices(axes(k))
offA, pA = CartesianIndex(A.offsets), parent(A)
oA, OA = safehead(offA), safetail(offA)
for I in safetail(Rnew)
IA = I-OA
for i in safehead(Rnew)
tmp = z
iA = i-oA
@inbounds for J in safetail(Rk), j in safehead(Rk)
tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
end
@inbounds out[i-o,I-O] = tmp
end
end
out
end
# function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
# off, k = CartesianIndex(kern.offsets), parent(kern)
# o, O = safehead(off), safetail(off)
# Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
# Rk = CartesianIndices(axes(k))
# offA, pA = CartesianIndex(A.offsets), parent(A)
# oA, OA = safehead(offA), safetail(offA)
# for I in safetail(Rnew)
# IA = I-OA
# for i in safehead(Rnew)
# tmp = z
# iA = i-oA
# @inbounds for J in safetail(Rk), j in safehead(Rk)
# tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
# end
# @inbounds out[i-o,I-O] = tmp
# end
# end
# out
# end

function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::ReshapedOneD, border::NoPad, inds)
Rpre, ind, Rpost = iterdims(inds, kern)
Expand All @@ -1043,68 +1046,110 @@ function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::R
return out
end
p = accumfilter(A[first(R)+first(Rk)], first(k))
z = zero(typeof(p+p))
z = float(zero(eltype(A)))
# z = zero(typeof(p+p))
_imfilter_inbounds!(r, z, out, A, k, Rpre, ind, Rpost)
end

# Many of the following are unfortunate specializations
function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
_imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
end
# # Many of the following are unfortunate specializations
# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
# _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
# end

function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, koffset=0)
# LoopVectorization.check_type(::Type{T}) where T<:ColorVectorSpace.MathTypes = true
# @generated function VectorizationBase.zero_vecunroll(::StaticInt{N}, ::StaticInt{W}, ::Type{Gray{T}}, ::StaticInt{RS}) where {N,W,T,RS}
# Expr(:block, Expr(:meta, :inline), :(VectorizationBase._vzero(VecUnroll{$(N-1),$W,$T,Vec{$W,$T}}, StaticInt{$RS}())))
# end
# function VectorizationBase._vload_unroll(
# sptr::AbstractStridedPointer{Gray{T},N,C,B}, u::Unroll{AU,F,UN,AV,W,M,UX,I}, ::A, ::StaticInt{RS}, ::StaticInt{X}
# ) where {T<:NativeTypes,N,C,B,AU,F,UN,AV,W,M,UX,I<:IndexNoUnroll,A<:StaticBool,RS,X}
# VectorizationBase.VecUnroll{N,1,T,T}(x::S) where {N,T<:VectorizationBase.NativeTypes,S<:FixedPoint{T}} =
# VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
# VectorizationBase.VecUnroll(x::FixedPoint) = VectorizationBase.VecUnroll(reinterpret(x))
# VectorizationBase.VecUnroll(x::AbstractGray) = VectorizationBase.VecUnroll(gray(x))
# VectorizationBase.VecUnroll(x::Gray) where {N,T<:VectorizationBase.NativeTypes} =
# VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))

const LVTypes = Union{VectorizationBase.NativeTypes, SVector{N,<:VectorizationBase.NativeTypes} where N}

const args = Ref{Any}()
function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray{<:LVTypes}, A::AbstractArray{<:LVTypes}, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
if !LoopVectorization.check_args(out, A)
@show summary(out) summary(A)
args[] = (deepcopy(out), deepcopy(A))
error("this should have worked")
end
indsk = axes(k, 1)
zout = convert(eltype(out), z)
for Ipost in Rpost
for i in ind
ik = i+koffset
for Ipre in Rpre
tmp = z
@turbo for Ipre in Rpre
tmp = zout
for j in indsk
@inbounds tmp += safe_for_prod(A[Ipre,ik+j,Ipost], tmp)*k[j]
tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j]
end
@inbounds out[Ipre,i,Ipost] = tmp
out[Ipre,i,Ipost] = tmp
end
end
end
out
end

function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
Rpre, ind, Rpost = iterdims(inds, kern)
k = kern.data
R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
if isempty(R) || isempty(Rk)
return out
end
p = accumfilter(A[first(R)+first(Rk)], first(k))
z = zero(typeof(p+p))
Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
_imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
end

function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
_imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
end

function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost, koffset=0)
# No @turbo version
function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
indsk = axes(k, 1)
zout = convert(eltype(out), z)
for Ipost in Rpost
IOpost = Ipost - Opost
for i in ind
io = i-o+koffset
for Ipre in Rpre
IOpre = Ipre - Opre
tmp = z
@inbounds for Ipre in Rpre
tmp = zout
for j in indsk
@inbounds tmp += safe_for_prod(A[IOpre,io+j,IOpost], tmp)*k[j]
tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j]
end
@inbounds out[Ipre,i,Ipost] = tmp
out[Ipre,i,Ipost] = tmp
end
end
end
out
end
# end unfortunate specializations

# function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
# Rpre, ind, Rpost = iterdims(inds, kern)
# k = kern.data
# R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
# if isempty(R) || isempty(Rk)
# return out
# end
# p = accumfilter(A[first(R)+first(Rk)], first(k))
# z = zero(typeof(p+p))
# Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
# _imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
# end

# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
# _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
# end

# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
# indsk = axes(k, 1)
# zout = convert(eltype(out), z)
# for Ipost in Rpost
# IOpost = Ipost - Opost
# for i in ind
# io = i-o+koffset
# @turbo for Ipre in Rpre
# IOpre = Ipre - Opre
# tmp = zout
# for j in indsk
# tmp += safe_for_prod(A[IOpre,io+j,IOpost], z)*k[j]
# end
# @inbounds out[Ipre,i,Ipost] = tmp
# end
# end
# end
# out
# end
# # end unfortunate specializations

## commented out because "virtual padding" is commented out
# function _imfilter_iter!(r::AbstractResource, out, padded, kernel::AbstractArray, iter)
Expand Down
3 changes: 3 additions & 0 deletions src/kernelfactors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ for op in (:+, :-, :*, :/)
end
Base.BroadcastStyle(::Type{R}) where {R<:ReshapedOneD{T,N}} where {T,N} = Broadcast.DefaultArrayStyle{N}()

Base.:/(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data / x)
Base.:*(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data * x)

_reshape(A::ReshapedOneD{T,N}, ::Val{N}) where {T,N} = A
_vec(A::ReshapedOneD) = A.data
Base.vec(A::ReshapedOneD) = A.data # is this OK? (note indices won't nec. start with 1)
Expand Down
24 changes: 24 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,27 @@ accumfilter(pixelval::Colorant{N0f8}, filterval::N0f8) = float32(c)*Float32(filt
# In theory, the following might need to be specialized. For safety, make it a
# standalone function call.
safe_for_prod(x, ref) = oftype(ref, x)

# For LoopVectorization, the easiest path is to convert to native types
native_eltype(::Type{T}) where T<:VectorizationBase.NativeTypes = T, 1
native_eltype(::Type{T}) where T<:FixedPoint = FixedPointNumbers.rawtype(T), FixedPointNumbers.rawone(T)
native_eltype(::Type{Gray{T}}) where T = native_eltype(T)
function native_eltype(::Type{RGB{T}}) where T
T′, f = native_eltype(T)
return SVector{3,T′}, f
end
native_eltype(::Type{T}) where T = T, 1

function maybe_reinterpret(out::AbstractArray, A::AbstractArray, kernel::Tuple{Any,Vararg{Any}})
Tout = eltype(out)
Tout′, outf = native_eltype(Tout)
TA = eltype(A)
TA′, Af = native_eltype(TA)
if Tout′ !== Tout || TA′ !== TA
kernel′ = ((kernel[1]/Af)*outf, Base.tail(kernel)...)
return (Tout === Tout′ ? out : reinterpret(reshape, Tout′, out),
TA === TA′ ? A : reinterpret(reshape, TA′, A),
kernel′)
end
return out, A, kernel
end