From 25dbbbe9b5632c6f0a0039eb41834eac93c4a4ff Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Fri, 12 Jan 2024 13:49:25 -0500
Subject: [PATCH 1/8] allow providing fft plans

---
 .github/workflows/UnitTest.yml | 21 +++------
 Project.toml                   |  3 ++
 demo.jl                        | 79 ++++++++++++++++++++++++++++++++++
 src/ImageFiltering.jl          | 13 +++++-
 src/imfilter.jl                | 54 +++++++++++++++++++++--
 test/2d.jl                     | 40 ++++++++++-------
 test/gabor.jl                  | 14 +++---
 test/nd.jl                     |  7 ++-
 test/runtests.jl               | 16 ++++---
 9 files changed, 199 insertions(+), 48 deletions(-)
 create mode 100644 demo.jl

diff --git a/.github/workflows/UnitTest.yml b/.github/workflows/UnitTest.yml
index 36f9a763..b2410dde 100644
--- a/.github/workflows/UnitTest.yml
+++ b/.github/workflows/UnitTest.yml
@@ -7,8 +7,6 @@ on:
     branches:
       - master
   pull_request:
-  schedule:
-    - cron: '20 00 1 * *'
 
 jobs:
   test:
@@ -20,23 +18,18 @@ jobs:
         os: [ubuntu-latest, windows-latest, macOS-latest]
 
     steps:
-      - uses: actions/checkout@v1.0.0
+      - uses: actions/checkout@v4
       - name: "Set up Julia"
         uses: julia-actions/setup-julia@v1
         with:
           version: ${{ matrix.julia-version }}
 
-      - name: Cache artifacts
-        uses: actions/cache@v1
-        env:
-          cache-name: cache-artifacts
-        with:
-          path: ~/.julia/artifacts
-          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
-          restore-keys: |
-            ${{ runner.os }}-test-${{ env.cache-name }}-
-            ${{ runner.os }}-test-
-            ${{ runner.os }}-
+      - uses: julia-actions/cache@v1
+
+      - run: |
+          import Pkg
+          Pkg.add(Pkg.PackageSpec(url="https://github.com/HolyLab/RFFT.jl", rev="ib/add_copy"))
+        shell: julia --project --color=yes {0}
 
       - name: "Unit Test"
         uses: julia-actions/julia-runtest@v1
diff --git a/Project.toml b/Project.toml
index f281ce96..021fb11b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ author = ["Tim Holy <tim.holy@gmail.com>", "Jan Weidner <jw3126@gmail.com>"]
 version = "0.7.8"
 
 [deps]
+AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 CatIndices = "aafaddc9-749c-510e-ac4f-586e18779b91"
 ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -15,6 +16,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+RFFT = "3bd9afcd-55df-531a-9b34-dc642dce7b95"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -31,6 +33,7 @@ ImageCore = "0.10"
 OffsetArrays = "1.9"
 PrecompileTools = "1"
 Reexport = "1.1"
+RFFT = "0.1.1"
 StaticArrays = "0.10, 0.11, 0.12, 1.0"
 Statistics = "1"
 TiledIteration = "0.2, 0.3, 0.4, 0.5"
diff --git a/demo.jl b/demo.jl
new file mode 100644
index 00000000..7743112e
--- /dev/null
+++ b/demo.jl
@@ -0,0 +1,79 @@
+using ImageFiltering, FFTW, LinearAlgebra, Profile, Random
+# using ProfileView
+using ComputationalResources
+
+FFTW.set_num_threads(parse(Int, get(ENV, "FFTW_NUM_THREADS", "1")))
+BLAS.set_num_threads(parse(Int, get(ENV, "BLAS_NUM_THREADS", string(Threads.nthreads() ÷ 2))))
+
+function benchmark(mats)
+    kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
+    Threads.@threads for mat in mats
+        frame_filtered = deepcopy(mat[:, :, 1])
+        r_cached = CPU1(ImageFiltering.planned_fft(frame_filtered, kernel))
+        for i in axes(mat, 3)
+            frame = @view mat[:, :, i]
+            imfilter!(r_cached, frame_filtered, frame, kernel)
+        end
+        return
+    end
+end
+
+function test(mats)
+    kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
+    for mat in mats
+        f1 = deepcopy(mat[:, :, 1])
+        r_cached = CPU1(ImageFiltering.planned_fft(f1, kernel))
+        f2 = deepcopy(mat[:, :, 1])
+        r_noncached = CPU1(Algorithm.FFT())
+        for i in axes(mat, 3)
+            frame = @view mat[:, :, i]
+            @info "imfilter! noncached"
+            imfilter!(r_noncached, f2, frame, kernel)
+            @info "imfilter! cached"
+            imfilter!(r_cached, f1, frame, kernel)
+            @show f1[1:4] f2[1:4]
+            f1 ≈ f2 || error("f1 !≈ f2")
+        end
+        return
+    end
+end
+
+function profile()
+    Random.seed!(1)
+    nmats = 10
+    mats = [rand(Float32, rand(80:100), rand(80:100), rand(2000:3000)) for _ in 1:nmats]
+    GC.gc(true)
+
+    # benchmark(mats)
+
+    # for _ in 1:3
+    #     @time "warm run of benchmark(mats)" benchmark(mats)
+    # end
+
+    test(mats)
+
+    # Profile.clear()
+    # @profile benchmark(mats)
+
+    # Profile.print(IOContext(stdout, :displaysize => (24, 200)); C=true, combine=true, mincount=100)
+    # # ProfileView.view()
+    # GC.gc(true)
+end
+
+profile()
+
+using ImageFiltering
+using ImageFiltering.RFFT
+
+function mwe()
+    a = rand(Float64, 10, 10)
+    out1 = rfft(a)
+
+    buf = RFFT.RCpair{Float64}(undef, size(a))
+    rfft_plan = RFFT.plan_rfft!(buf)
+    copy!(buf, a)
+    out2 = complex(rfft_plan(buf))
+
+    return out1 ≈ out2
+end
+mwe()
\ No newline at end of file
diff --git a/src/ImageFiltering.jl b/src/ImageFiltering.jl
index bbb164e2..582e018f 100644
--- a/src/ImageFiltering.jl
+++ b/src/ImageFiltering.jl
@@ -1,12 +1,14 @@
 module ImageFiltering
 
 using FFTW
+using RFFT
 using ImageCore, FFTViews, OffsetArrays, StaticArrays, ComputationalResources, TiledIteration
 # Where possible we avoid a direct dependency to reduce the number of [compat] bounds
 # using FixedPointNumbers: Normed, N0f8 # reexported by ImageCore
 using ImageCore.MappedArrays
 using Statistics, LinearAlgebra
 using Base: Indices, tail, fill_to_length, @pure, depwarn, @propagate_inbounds
+import Base: copy!
 using OffsetArrays: IdentityUnitRange   # using the one in OffsetArrays makes this work with multiple Julia versions
 using SparseArrays   # only needed to fix an ambiguity in borderarray
 using Reexport
@@ -30,7 +32,8 @@ export Kernel, KernelFactors,
     imgradients, padarray, centered, kernelfactors, reflect,
     freqkernel, spacekernel,
     findlocalminima, findlocalmaxima,
-    blob_LoG, BlobLoG
+    blob_LoG, BlobLoG,
+    planned_fft
 
 FixedColorant{T<:Normed} = Colorant{T}
 StaticOffsetArray{T,N,A<:StaticArray} = OffsetArray{T,N,A}
@@ -50,10 +53,16 @@ function Base.transpose(A::StaticOffsetArray{T,2}) where T
 end
 
 module Algorithm
+    import FFTW
     # deliberately don't export these, but it's expected that they
     # will be used as Algorithm.FFT(), etc.
     abstract type Alg end
-    "Filter using the Fast Fourier Transform" struct FFT <: Alg end
+    "Filter using the Fast Fourier Transform" struct FFT <: Alg
+        plan1::Union{Function,Nothing}
+        plan2::Union{Function,Nothing}
+        plan3::Union{Function,Nothing}
+    end
+    FFT() = FFT(nothing, nothing, nothing)
     "Filter using a direct algorithm" struct FIR <: Alg end
     "Cache-efficient filtering using tiles" struct FIRTiled{N} <: Alg
         tilesize::Dims{N}
diff --git a/src/imfilter.jl b/src/imfilter.jl
index 283352b8..ab9b2f57 100644
--- a/src/imfilter.jl
+++ b/src/imfilter.jl
@@ -826,7 +826,7 @@ function _imfilter_fft!(r::AbstractCPU{FFT},
     for I in CartesianIndices(axes(kern))
         krn[I] = kern[I]
     end
-    Af = filtfft(A, krn)
+    Af = filtfft(A, krn, r.settings.plan1, r.settings.plan2, r.settings.plan3)
     if map(first, axes(out)) == map(first, axes(Af))
         R = CartesianIndices(axes(out))
         copyto!(out, R, Af, R)
@@ -837,13 +837,61 @@ function _imfilter_fft!(r::AbstractCPU{FFT},
         src = view(FFTView(Af), axes(dest)...)
         copyto!(dest, src)
     end
-    out
+    return out
+end
+
+function buffered_planned_rfft(a::AbstractArray{T}) where {T}
+    buf = RFFT.RCpair{T}(undef, size(a))
+    plan = RFFT.plan_rfft!(buf; flags=FFTW.MEASURE)
+    return function (arr::AbstractArray{T}) where {T}
+        copy!(buf, OffsetArrays.no_offset_view(arr))
+        return plan(buf)
+    end
+end
+function buffered_planned_irfft(a::AbstractArray{T}) where {T}
+    buf = RFFT.RCpair{T}(undef, size(a))
+    plan = RFFT.plan_irfft!(buf; flags=FFTW.MEASURE)
+    return function (arr::AbstractArray{T}) where {T}
+        copy!(buf, OffsetArrays.no_offset_view(arr))
+        return plan(buf)
+    end
 end
 
+function planned_fft(A::AbstractArray{T,N},
+            kernel::ProcessedKernel,
+            border::BorderSpecAny=Pad(:replicate)
+            ) where {T<:AbstractFloat,N}
+    bord = border(kernel, A, Algorithm.FFT())
+    _A = padarray(T, A, bord)
+    bfp1 = buffered_planned_rfft(_A)
+    kern = samedims(_A, kernelconv(kernel...))
+    krn = FFTView(zeros(eltype(kern), map(length, axes(_A))))
+    bfp2 = buffered_planned_rfft(krn)
+    bfp3 = buffered_planned_irfft(_A)
+    return Algorithm.FFT(bfp1, bfp2, bfp3)
+end
+planned_fft(A::AbstractArray, kernel, border::AbstractString) = planned_fft(A, kernel, borderinstance(border))
+planned_fft(A::AbstractArray, kernel::Union{ArrayLike,Laplacian}, border::BorderSpecAny) = planned_fft(A, factorkernel(kernel), border)
+
+function filtfft(A, krn, planned_rfft1::Function, planned_rfft2::Function, planned_irfft::Function)
+    B = complex(planned_rfft1(A))
+    B .*= conj!(complex(planned_rfft2(krn)))
+    return real(planned_irfft(complex(B)))
+end
+# TODO: this does not work. See TODO below
+function filtfft(A::AbstractArray{C}, krn, planned_rfft1::Function, planned_rfft2::Function, planned_irfft::Function) where {C<:Colorant}
+    Av, dims = channelview_dims(A)
+    kernrs = kreshape(C, krn)
+    B = complex(planned_rfft1(Av, dims)) # TODO: dims is not supported by planned_rfft1
+    B .*= conj!(complex(planned_rfft2(kernrs)))
+    Avf = real(planned_irfft(complex(B)))
+    return colorview(base_colorant_type(C){eltype(Avf)}, Avf)
+end
+filtfft(A, krn, ::Nothing, ::Nothing, ::Nothing) = filtfft(A, krn)
 function filtfft(A, krn)
     B = rfft(A)
     B .*= conj!(rfft(krn))
-    irfft(B, length(axes(A, 1)))
+    return irfft(B, length(axes(A, 1)))
 end
 function filtfft(A::AbstractArray{C}, krn) where {C<:Colorant}
     Av, dims = channelview_dims(A)
diff --git a/test/2d.jl b/test/2d.jl
index 37d8fd29..25504630 100644
--- a/test/2d.jl
+++ b/test/2d.jl
@@ -36,6 +36,15 @@ using ImageFiltering: borderinstance
     end
 end
 
+function supported_algs(img, kernel, border)
+    if eltype(img) isa AbstractFloat
+        (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT(), planned_fft(img, kernel, border))
+    else
+        # TODO: extend planned_fft to support other types
+        (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+    end
+end
+
 @testset "FIR/FFT" begin
     f32type(img) = f32type(eltype(img))
     f32type(::Type{C}) where {C<:Colorant} = base_colorant_type(C){Float32}
@@ -50,6 +59,7 @@ end
     # Dense inseparable kernel
     kern = [0.1 0.2; 0.4 0.5]
     kernel = OffsetArray(kern, -1:0, 1:2)
+    border = Inner()
     for img in (imgf, imgi, imgg, imgc)
         targetimg = zeros(typeof(img[1]*kern[1]), size(img))
         targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
@@ -66,7 +76,7 @@ end
             @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
             fill!(ret, zero(eltype(ret)))
             @test @inferred(imfilter!(ret, img, kernel, border)) ≈ targetimg
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+            for alg in supported_algs(img, kernel, border)
                 @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
                 @test @inferred(imfilter(img, (kernel,), border, alg)) ≈ targetimg
                 @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
@@ -76,12 +86,12 @@ end
             @test_throws MethodError imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, border, Algorithm.FFT())
         end
         targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
-        @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
-        @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
-            @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
-            @test @inferred(imfilter(CPU1(alg), img, kernel, Inner())) ≈ targetimg_inner
+        @test @inferred(imfilter(img, kernel, border)) ≈ targetimg_inner
+        @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg_inner)
+        for alg in supported_algs(img, kernel, border)
+            @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg_inner
+            @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg_inner)
+            @test @inferred(imfilter(CPU1(alg), img, kernel, border)) ≈ targetimg_inner
         end
     end
     # Factored kernel
@@ -96,7 +106,7 @@ end
         for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
             @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
             @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+            for alg in supported_algs(img, kernel, border)
                 @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
                 @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
             end
@@ -106,7 +116,7 @@ end
         targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
         @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
         @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+        for alg in supported_algs(img, kernel, border)
             @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
             @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
         end
@@ -122,7 +132,7 @@ end
         for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
             @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
             @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+            for alg in supported_algs(img, kernel, border)
                 if alg == Algorithm.FFT() && eltype(img) == Int
                     @test @inferred(imfilter(Float64, img, kernel, border, alg)) ≈ targetimg
                 else
@@ -134,7 +144,7 @@ end
         targetimg_inner = OffsetArray(targetimg[2:end-1, 2:end-1], 2:4, 2:6)
         @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
         @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+        for alg in supported_algs(img, kernel, border)
             if alg == Algorithm.FFT() && eltype(img) == Int
                 @test @inferred(imfilter(Float64, img, kernel, Inner(), alg)) ≈ targetimg_inner
             else
@@ -184,7 +194,7 @@ end
             targetimg = target1(img, kern, border)
             @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
             @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+            for alg in supported_algs(img, kernel, border)
                 @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
                 @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
             end
@@ -195,7 +205,7 @@ end
         targetimg = zerona!(copy(targetimg0))
         @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
         @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+        for alg in supported_algs(img, kernel, border)
             @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
             @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
         end
@@ -208,7 +218,7 @@ end
             targetimg = target1(img, kern, border)
             @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
             @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+            for alg in supported_algs(img, kernel, border)
                 @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
                 @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
             end
@@ -219,7 +229,7 @@ end
         targetimg = zerona!(copy(targetimg0))
         @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
         @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+        for alg in supported_algs(img, kernel, border)
             @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
             @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
         end
diff --git a/test/gabor.jl b/test/gabor.jl
index 5c77508e..faa49cb4 100644
--- a/test/gabor.jl
+++ b/test/gabor.jl
@@ -7,11 +7,15 @@ using ImageFiltering, Test, Statistics
     size_y = 6*σy+1
     γ = σx/σy
     # zero size forces default kernel width, with warnings
-    @info "Four warnings are expected"
-    kernel = Kernel.gabor(0,0,σx,0,5,γ,0)
-    @test isequal(size(kernel[1]),(size_x,size_y))
-    kernel = Kernel.gabor(0,0,σx,π,5,γ,0)
-    @test isequal(size(kernel[1]),(size_x,size_y))
+
+    @test_logs (:warn, r"The input parameter size_") match_mode=:any begin
+        kernel = Kernel.gabor(0,0,σx,0,5,γ,0)
+        @test isequal(size(kernel[1]),(size_x,size_y))
+    end
+    @test_logs (:warn, r"The input parameter size_") match_mode=:any begin
+        kernel = Kernel.gabor(0,0,σx,π,5,γ,0)
+        @test isequal(size(kernel[1]),(size_x,size_y))
+    end
 
     for x in 0:4, y in 0:4, z in 0:4, t in 0:4
         σx1 = 2*x+1
diff --git a/test/nd.jl b/test/nd.jl
index 8842743f..76c1006d 100644
--- a/test/nd.jl
+++ b/test/nd.jl
@@ -50,6 +50,8 @@ Base.zero(::Type{WrappedFloat}) = WrappedFloat(0.0)
     v = fill(0xff, 10)
     kern = centered(fill(0xff, 3))
     @info "Two warnings are expected"
+    # TODO: use @test_logs (:warn, r"Likely overflow or conversion error detected") match_mode=:any
+    # julia has an internal error currently on 1.10.2 that this hits https://github.com/JuliaLang/julia/pull/50759
     @test_throws InexactError imfilter(v, kern)
     vout = imfilter(UInt32, v, kern)
     @test eltype(vout) == UInt32
@@ -106,8 +108,8 @@ Base.zero(::Type{WrappedFloat}) = WrappedFloat(0.0)
         around_i = [abs(i-j) <= 15 for j in eachindex(v)]
         @test all(isequal(x), wf[around_i])
         @test wf[.!around_i] ≈ vf[.!around_i]
-    end   
-   
+    end
+
     # Issue #110
     img = reinterpret(WrappedFloat, rand(128))
     kern = centered(rand(31))
@@ -129,6 +131,7 @@ end
     img = trues(10,10,10)
     kernel = centered(trues(3,3,3)/27)
     for border in ("replicate", "circular", "symmetric", "reflect", Fill(true))
+        # TODO: add support for boolean images in planned_fft
         for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
             @test imfilter(img, kernel, border) ≈ img
         end
diff --git a/test/runtests.jl b/test/runtests.jl
index cfb5ae59..955179dc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,13 @@ using ImageQualityIndexes
 import StaticArrays
 using Random
 
+function typestring(::Type{T}) where T   # from https://github.com/JuliaImages/ImageCore.jl/pull/133
+    buf = IOBuffer()
+    show(buf, T)
+    String(take!(buf))
+end
+
+@testset "ImageFiltering" verbose=true begin
 @testset "Project meta quality checks" begin
     # Ambiguity test
     if Base.VERSION >= v"1.6.0-DEV.1005"   # julia #37616
@@ -26,12 +33,6 @@ using Random
     end
 end
 
-function typestring(::Type{T}) where T   # from https://github.com/JuliaImages/ImageCore.jl/pull/133
-    buf = IOBuffer()
-    show(buf, T)
-    String(take!(buf))
-end
-
 include("compat.jl")
 include("border.jl")
 include("nd.jl")
@@ -49,7 +50,6 @@ include("models.jl")
 
 CUDA_INSTALLED = false
 try
-    global CUDA_INSTALLED
     # This errors with `IOError` when nvidia driver is not available,
     # in which case we don't even need to try `using CUDA`
     run(pipeline(`nvidia-smi`, stdout=devnull, stderr=devnull))
@@ -71,3 +71,5 @@ else
     @warn "CUDA test: disabled"
 end
 nothing
+
+end

From a948fae6abbb948d1977da9bad8cfef246fc5cd4 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Mon, 11 Mar 2024 15:22:17 -0400
Subject: [PATCH 2/8] simplify demo

---
 demo.jl | 64 +++++++++++++++++++++++----------------------------------
 1 file changed, 26 insertions(+), 38 deletions(-)

diff --git a/demo.jl b/demo.jl
index 7743112e..6075b1b4 100644
--- a/demo.jl
+++ b/demo.jl
@@ -1,11 +1,10 @@
 using ImageFiltering, FFTW, LinearAlgebra, Profile, Random
-# using ProfileView
 using ComputationalResources
 
 FFTW.set_num_threads(parse(Int, get(ENV, "FFTW_NUM_THREADS", "1")))
 BLAS.set_num_threads(parse(Int, get(ENV, "BLAS_NUM_THREADS", string(Threads.nthreads() ÷ 2))))
 
-function benchmark(mats)
+function benchmark_new(mats)
     kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
     Threads.@threads for mat in mats
         frame_filtered = deepcopy(mat[:, :, 1])
@@ -17,6 +16,18 @@ function benchmark(mats)
         return
     end
 end
+function benchmark_old(mats)
+    kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
+    Threads.@threads for mat in mats
+        frame_filtered = deepcopy(mat[:, :, 1])
+        r_noncached = CPU1(Algorithm.FFT())
+        for i in axes(mat, 3)
+            frame = @view mat[:, :, i]
+            imfilter!(r_noncached, frame_filtered, frame, kernel)
+        end
+        return
+    end
+end
 
 function test(mats)
     kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
@@ -26,11 +37,8 @@ function test(mats)
         f2 = deepcopy(mat[:, :, 1])
         r_noncached = CPU1(Algorithm.FFT())
         for i in axes(mat, 3)
-            frame = @view mat[:, :, i]
-            @info "imfilter! noncached"
-            imfilter!(r_noncached, f2, frame, kernel)
-            @info "imfilter! cached"
-            imfilter!(r_cached, f1, frame, kernel)
+            imfilter!(r_noncached, f2, deepcopy(mat[:, :, i]), kernel)
+            imfilter!(r_cached, f1, deepcopy(mat[:, :, i]), kernel)
             @show f1[1:4] f2[1:4]
             f1 ≈ f2 || error("f1 !≈ f2")
         end
@@ -38,42 +46,22 @@ function test(mats)
     end
 end
 
-function profile()
+function run()
     Random.seed!(1)
     nmats = 10
-    mats = [rand(Float32, rand(80:100), rand(80:100), rand(2000:3000)) for _ in 1:nmats]
-    GC.gc(true)
+    mats = [rand(Float64, rand(80:100), rand(80:100), rand(2000:3000)) for _ in 1:nmats]
 
-    # benchmark(mats)
+    benchmark_new(mats)
+    for _ in 1:3
+        @time "warm run of benchmark_new(mats)" benchmark_new(mats)
+    end
 
-    # for _ in 1:3
-    #     @time "warm run of benchmark(mats)" benchmark(mats)
-    # end
+    benchmark_old(mats)
+    for _ in 1:3
+        @time "warm run of benchmark_old(mats)" benchmark_old(mats)
+    end
 
     test(mats)
-
-    # Profile.clear()
-    # @profile benchmark(mats)
-
-    # Profile.print(IOContext(stdout, :displaysize => (24, 200)); C=true, combine=true, mincount=100)
-    # # ProfileView.view()
-    # GC.gc(true)
 end
 
-profile()
-
-using ImageFiltering
-using ImageFiltering.RFFT
-
-function mwe()
-    a = rand(Float64, 10, 10)
-    out1 = rfft(a)
-
-    buf = RFFT.RCpair{Float64}(undef, size(a))
-    rfft_plan = RFFT.plan_rfft!(buf)
-    copy!(buf, a)
-    out2 = complex(rfft_plan(buf))
-
-    return out1 ≈ out2
-end
-mwe()
\ No newline at end of file
+run()

From 6ef477e1ada42a45875f569bc11a3f72c773610c Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Tue, 7 May 2024 09:15:26 -0400
Subject: [PATCH 3/8] RFFT -> RealFFTs & tidyup

---
 .github/workflows/UnitTest.yml |  7 +---
 Project.toml                   |  5 +--
 demo.jl                        | 67 ----------------------------------
 src/ImageFiltering.jl          |  2 +-
 src/imfilter.jl                |  8 ++--
 5 files changed, 8 insertions(+), 81 deletions(-)
 delete mode 100644 demo.jl

diff --git a/.github/workflows/UnitTest.yml b/.github/workflows/UnitTest.yml
index b2410dde..6d0f2912 100644
--- a/.github/workflows/UnitTest.yml
+++ b/.github/workflows/UnitTest.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        julia-version: ['1.6', '1', 'nightly']
+        julia-version: ['1.6', '1']
         os: [ubuntu-latest, windows-latest, macOS-latest]
 
     steps:
@@ -26,11 +26,6 @@ jobs:
 
       - uses: julia-actions/cache@v1
 
-      - run: |
-          import Pkg
-          Pkg.add(Pkg.PackageSpec(url="https://github.com/HolyLab/RFFT.jl", rev="ib/add_copy"))
-        shell: julia --project --color=yes {0}
-
       - name: "Unit Test"
         uses: julia-actions/julia-runtest@v1
 
diff --git a/Project.toml b/Project.toml
index 021fb11b..9ac19586 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,7 +4,6 @@ author = ["Tim Holy <tim.holy@gmail.com>", "Jan Weidner <jw3126@gmail.com>"]
 version = "0.7.8"
 
 [deps]
-AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 CatIndices = "aafaddc9-749c-510e-ac4f-586e18779b91"
 ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -15,8 +14,8 @@ ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
+RealFFTs = "3645faec-0534-4e91-afef-021db0981eec"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
-RFFT = "3bd9afcd-55df-531a-9b34-dc642dce7b95"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -33,7 +32,7 @@ ImageCore = "0.10"
 OffsetArrays = "1.9"
 PrecompileTools = "1"
 Reexport = "1.1"
-RFFT = "0.1.1"
+RealFFTs = "1"
 StaticArrays = "0.10, 0.11, 0.12, 1.0"
 Statistics = "1"
 TiledIteration = "0.2, 0.3, 0.4, 0.5"
diff --git a/demo.jl b/demo.jl
deleted file mode 100644
index 6075b1b4..00000000
--- a/demo.jl
+++ /dev/null
@@ -1,67 +0,0 @@
-using ImageFiltering, FFTW, LinearAlgebra, Profile, Random
-using ComputationalResources
-
-FFTW.set_num_threads(parse(Int, get(ENV, "FFTW_NUM_THREADS", "1")))
-BLAS.set_num_threads(parse(Int, get(ENV, "BLAS_NUM_THREADS", string(Threads.nthreads() ÷ 2))))
-
-function benchmark_new(mats)
-    kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
-    Threads.@threads for mat in mats
-        frame_filtered = deepcopy(mat[:, :, 1])
-        r_cached = CPU1(ImageFiltering.planned_fft(frame_filtered, kernel))
-        for i in axes(mat, 3)
-            frame = @view mat[:, :, i]
-            imfilter!(r_cached, frame_filtered, frame, kernel)
-        end
-        return
-    end
-end
-function benchmark_old(mats)
-    kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
-    Threads.@threads for mat in mats
-        frame_filtered = deepcopy(mat[:, :, 1])
-        r_noncached = CPU1(Algorithm.FFT())
-        for i in axes(mat, 3)
-            frame = @view mat[:, :, i]
-            imfilter!(r_noncached, frame_filtered, frame, kernel)
-        end
-        return
-    end
-end
-
-function test(mats)
-    kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
-    for mat in mats
-        f1 = deepcopy(mat[:, :, 1])
-        r_cached = CPU1(ImageFiltering.planned_fft(f1, kernel))
-        f2 = deepcopy(mat[:, :, 1])
-        r_noncached = CPU1(Algorithm.FFT())
-        for i in axes(mat, 3)
-            imfilter!(r_noncached, f2, deepcopy(mat[:, :, i]), kernel)
-            imfilter!(r_cached, f1, deepcopy(mat[:, :, i]), kernel)
-            @show f1[1:4] f2[1:4]
-            f1 ≈ f2 || error("f1 !≈ f2")
-        end
-        return
-    end
-end
-
-function run()
-    Random.seed!(1)
-    nmats = 10
-    mats = [rand(Float64, rand(80:100), rand(80:100), rand(2000:3000)) for _ in 1:nmats]
-
-    benchmark_new(mats)
-    for _ in 1:3
-        @time "warm run of benchmark_new(mats)" benchmark_new(mats)
-    end
-
-    benchmark_old(mats)
-    for _ in 1:3
-        @time "warm run of benchmark_old(mats)" benchmark_old(mats)
-    end
-
-    test(mats)
-end
-
-run()
diff --git a/src/ImageFiltering.jl b/src/ImageFiltering.jl
index 582e018f..860c07e0 100644
--- a/src/ImageFiltering.jl
+++ b/src/ImageFiltering.jl
@@ -1,7 +1,7 @@
 module ImageFiltering
 
 using FFTW
-using RFFT
+using RealFFTs
 using ImageCore, FFTViews, OffsetArrays, StaticArrays, ComputationalResources, TiledIteration
 # Where possible we avoid a direct dependency to reduce the number of [compat] bounds
 # using FixedPointNumbers: Normed, N0f8 # reexported by ImageCore
diff --git a/src/imfilter.jl b/src/imfilter.jl
index ab9b2f57..a7fd760d 100644
--- a/src/imfilter.jl
+++ b/src/imfilter.jl
@@ -841,16 +841,16 @@ function _imfilter_fft!(r::AbstractCPU{FFT},
 end
 
 function buffered_planned_rfft(a::AbstractArray{T}) where {T}
-    buf = RFFT.RCpair{T}(undef, size(a))
-    plan = RFFT.plan_rfft!(buf; flags=FFTW.MEASURE)
+    buf = RealFFTs.RCpair{T}(undef, size(a))
+    plan = RealFFTs.plan_rfft!(buf; flags=FFTW.MEASURE)
     return function (arr::AbstractArray{T}) where {T}
         copy!(buf, OffsetArrays.no_offset_view(arr))
         return plan(buf)
     end
 end
 function buffered_planned_irfft(a::AbstractArray{T}) where {T}
-    buf = RFFT.RCpair{T}(undef, size(a))
-    plan = RFFT.plan_irfft!(buf; flags=FFTW.MEASURE)
+    buf = RealFFTs.RCpair{T}(undef, size(a))
+    plan = RealFFTs.plan_irfft!(buf; flags=FFTW.MEASURE)
     return function (arr::AbstractArray{T}) where {T}
         copy!(buf, OffsetArrays.no_offset_view(arr))
         return plan(buf)

From 2988c21ae086fb00f349d6a027ca262ea6b2bcd3 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Tue, 7 May 2024 09:27:59 -0400
Subject: [PATCH 4/8] use test_logs for overflow warnings

---
 test/nd.jl | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/test/nd.jl b/test/nd.jl
index 76c1006d..b57ae3f6 100644
--- a/test/nd.jl
+++ b/test/nd.jl
@@ -49,10 +49,11 @@ Base.zero(::Type{WrappedFloat}) = WrappedFloat(0.0)
     # Element-type widening (issue #17)
     v = fill(0xff, 10)
     kern = centered(fill(0xff, 3))
-    @info "Two warnings are expected"
-    # TODO: use @test_logs (:warn, r"Likely overflow or conversion error detected") match_mode=:any
-    # julia has an internal error currently on 1.10.2 that this hits https://github.com/JuliaLang/julia/pull/50759
-    @test_throws InexactError imfilter(v, kern)
+
+    @test_logs (:warn, r"Likely overflow or conversion error detected") begin
+        @test_throws InexactError imfilter(v, kern)
+    end
+
     vout = imfilter(UInt32, v, kern)
     @test eltype(vout) == UInt32
     @test all(x->x==0x0002fa03, vout)
@@ -121,7 +122,9 @@ end
     # issue #17
     img = fill(typemax(Int16), 10, 10)
     kern = centered(Int16[1 2 2 2 1])
-    @test_throws InexactError imfilter(img, kern)
+    @test_logs (:warn, r"Likely overflow or conversion error detected") begin
+        @test_throws InexactError imfilter(img, kern)
+    end
     ret = imfilter(Int32, img, kern)
     @test eltype(ret) == Int32
     @test all(x->x==262136, ret)

From 270e59f7d2d4c7ff0a7304eaf710a168afeeb0e0 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Tue, 7 May 2024 09:29:03 -0400
Subject: [PATCH 5/8] bump julia compat to 1.10

---
 .github/workflows/UnitTest.yml | 2 +-
 Project.toml                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/UnitTest.yml b/.github/workflows/UnitTest.yml
index 6d0f2912..9460199e 100644
--- a/.github/workflows/UnitTest.yml
+++ b/.github/workflows/UnitTest.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        julia-version: ['1.6', '1']
+        julia-version: ['1.10', '1']
         os: [ubuntu-latest, windows-latest, macOS-latest]
 
     steps:
diff --git a/Project.toml b/Project.toml
index 9ac19586..17a6ff43 100644
--- a/Project.toml
+++ b/Project.toml
@@ -36,7 +36,7 @@ RealFFTs = "1"
 StaticArrays = "0.10, 0.11, 0.12, 1.0"
 Statistics = "1"
 TiledIteration = "0.2, 0.3, 0.4, 0.5"
-julia = "1.6"
+julia = "1.10"
 
 [extras]
 AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9"

From 9a290c9c67ce85d6988e6c9ad30f1f37cd788066 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Tue, 7 May 2024 09:51:20 -0400
Subject: [PATCH 6/8] expand TODO based on comment

---
 src/imfilter.jl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/imfilter.jl b/src/imfilter.jl
index a7fd760d..e1a3a44f 100644
--- a/src/imfilter.jl
+++ b/src/imfilter.jl
@@ -878,11 +878,17 @@ function filtfft(A, krn, planned_rfft1::Function, planned_rfft2::Function, plann
     B .*= conj!(complex(planned_rfft2(krn)))
     return real(planned_irfft(complex(B)))
 end
-# TODO: this does not work. See TODO below
+# TODO: this Colorant method does not work. See TODO below
 function filtfft(A::AbstractArray{C}, krn, planned_rfft1::Function, planned_rfft2::Function, planned_irfft::Function) where {C<:Colorant}
     Av, dims = channelview_dims(A)
     kernrs = kreshape(C, krn)
     B = complex(planned_rfft1(Av, dims)) # TODO: dims is not supported by planned_rfft1
+    # Quoting Tim Holy in https://github.com/JuliaImages/ImageFiltering.jl/pull/271/files#r1559210348
+    # I don't think dims can be a point of flexibility: these plans are specific to the
+    # memory layout of the array. (The planning explores various implementations and picks
+    # the fastest discovered; performance is strongly dependent on memory layout, so the
+    # choice for one layout may not be the same as another.) You'd have to create a plan
+    # specifically to the colorant array-type.
     B .*= conj!(complex(planned_rfft2(kernrs)))
     Avf = real(planned_irfft(complex(B)))
     return colorview(base_colorant_type(C){eltype(Avf)}, Avf)

From 7ff1a55c35ac4030ea325ce89f7d3550da6c2733 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Tue, 7 May 2024 10:05:23 -0400
Subject: [PATCH 7/8] add to docs

---
 docs/src/index.md | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index fd57e3f7..7f168f1d 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -13,7 +13,7 @@ processing.
 
 The main functions provided by this package are:
 
-| Function                 | Action         | 
+| Function                 | Action         |
 |:-------------------------|:---------------|
 |[`imfilter`](@ref)        | Filter a one, two or multidimensional array img with a kernel by computing their correlation |
 |[`imfilter!`](@ref)       | Filter an array img with kernel kernel by computing their correlation, storing the result in imgfilt |
@@ -25,7 +25,7 @@ The main functions provided by this package are:
 |[`findlocalminima`](@ref) | Returns the coordinates of elements whose value is smaller than all of their immediate neighbors |
 |[`findlocalmaxima`](@ref) | Returns the coordinates of elements whose value is larger than all of their immediate neighbors |
 
-Common kernels (filters) are organized in the `Kernel` and `KernelFactors` modules. 
+Common kernels (filters) are organized in the `Kernel` and `KernelFactors` modules.
 
 A common task in image processing and computer vision is computing
 image *gradients* (derivatives), for which there is the dedicated
@@ -80,6 +80,24 @@ transform (FFT).  By default, this choice is made based on kernel
 size. You can manually specify the algorithm using [`Algorithm.FFT()`](@ref)
 or [`Algorithm.FIR()`](@ref).
 
+#### Reusing FFT plans
+
+It is possible to reuse FFT plans if the operation is going to be done on the
+same array type and dimensions i.e. on each image of an image stack
+
+```julia
+using ImageFiltering, ComputationalResources
+imgstack = rand(Float64, 200, 100, 10)
+imgstack_filtered = similar(imgstack)
+
+kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
+fft_planned = CPU1(ImageFiltering.planned_fft(imgstack_filtered[:,:,1], kernel))
+
+for i in axes(imgstack, 3)
+    imfilter!(fft_planned, imgstack_filtered[:,:,i], imgstack[:,:,i], kernel)
+end
+```
+
 ### Feature: Multithreading
 
 If you launch Julia with `JULIA_NUM_THREADS=n` (where `n > 1`), then

From 9db8caf73954b4753af27d1460319cd139d9048e Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Tue, 7 May 2024 10:25:47 -0400
Subject: [PATCH 8/8] fix test coverage

---
 test/2d.jl | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/test/2d.jl b/test/2d.jl
index 25504630..9331699f 100644
--- a/test/2d.jl
+++ b/test/2d.jl
@@ -36,13 +36,12 @@ using ImageFiltering: borderinstance
     end
 end
 
-function supported_algs(img, kernel, border)
-    if eltype(img) isa AbstractFloat
-        (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT(), planned_fft(img, kernel, border))
-    else
-        # TODO: extend planned_fft to support other types
-        (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-    end
+# TODO: extend planned_fft to support more types than just floats
+function supported_algs(img::AbstractArray{T}, kernel, border) where {T<:AbstractFloat}
+    return (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT(), planned_fft(img, kernel, border))
+end
+function supported_algs(img::AbstractArray, kernel, border)
+    return (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
 end
 
 @testset "FIR/FFT" begin