Add GLFFT

2019-09-11 23:06:03 -07:00
parent 580e296e86
commit b26f11ae20
10 changed files with 3001 additions and 0 deletions
--- a/glfft/LICENSE_ORIGINAL
+++ b/glfft/LICENSE_ORIGINAL
@@ -0,0 +1,19 @@
 Copyright (c) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
--- a/glfft/glfft.cpp
+++ b/glfft/glfft.cpp
--- a/glfft/glfft.hpp
+++ b/glfft/glfft.hpp
@@ -0,0 +1,225 @@
 /* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef GLFFT_HPP__
 #define GLFFT_HPP__
 #include "glfft_interface.hpp"
 #include "glfft_common.hpp"
 #include "glfft_wisdom.hpp"
 #include <vector>
 #include <unordered_map>
 #include <limits>
 /// GLFFT doesn't try to preserve GL state in any way.
 /// E.g. SHADER_STORAGE_BUFFER bindings, programs bound, texture bindings, etc.
 /// Applications calling this library must expect that some GL state will be modified.
 /// No rendering state associated with graphics will be modified.
 namespace GLFFT
 {
 /// @brief An FFT plan: holds the list of passes built by a constructor and records them
 ///        into a command buffer through process().
 class FFT
 {
    public:
        /// @brief Creates a full FFT.
        ///
        /// All buffer allocation done by GLFFT will be done in constructor.
        /// Will throw if invalid parameters are passed.
        ///
        /// @param context       The graphics context.
        /// @param Nx            Number of samples in horizontal dimension.
        /// @param Ny            Number of samples in vertical dimension.
        /// @param type          The transform type.
        /// @param direction     Forward, inverse or inverse with convolution.
        ///                      For real-to-complex and complex-to-real transforms, the
        ///                      transform type must match.
        /// @param input_target  GL object type of input target. For real-to-complex with texture as input, ImageReal is used.
        /// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used.
        /// @param cache         A program cache for caching the GLFFT programs created.
        /// @param options       FFT options such as performance related parameters and types.
        /// @param wisdom        GLFFT wisdom which can override performance related options
        ///                      (options.performance is used as a fallback).
        FFT(Context *context, unsigned Nx, unsigned Ny,
                Type type, Direction direction, Target input_target, Target output_target,
                std::shared_ptr<ProgramCache> cache, const FFTOptions &options,
                const FFTWisdom &wisdom = FFTWisdom());
        /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs.
        ///
        /// All buffer allocation done by GLFFT will be done in constructor.
        /// Will throw if invalid parameters are passed.
        ///
        /// @param context       The graphics context.
        /// @param Nx            Number of samples in horizontal dimension.
        /// @param Ny            Number of samples in vertical dimension.
        /// @param radix         FFT radix to test.
        /// @param p             Accumulated p factor. If 1, "first pass" mode is tested, otherwise, generic FFT stages.
        /// @param mode          The transform mode.
        /// @param input_target  GL object type of input target. For real-to-complex with texture as input, ImageReal is used.
        /// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used.
        /// @param cache         A program cache for caching the GLFFT programs created.
        /// @param options       FFT options such as performance related parameters and types.
        FFT(Context *context, unsigned Nx, unsigned Ny, unsigned radix, unsigned p,
                Mode mode, Target input_target, Target output_target,
                std::shared_ptr<ProgramCache> cache, const FFTOptions &options);
        /// @brief Process the FFT.
        ///
        /// The type of object passed here must match what FFT was initialized with.
        ///
        /// @param cmd       Command buffer for issuing dispatch commands.
        /// @param output    Output buffer or image.
        ///                  NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D!
        /// @param input     Input buffer or texture.
        /// @param input_aux If using convolution transform type,
        ///                  the content of input and input_aux will be multiplied together.
        void process(CommandBuffer *cmd, Resource *output, Resource *input, Resource *input_aux = nullptr);
        /// @brief Run process() multiple times, timing the results.
        ///
        /// Mostly used internally by GLFFT wisdom, glfft_cli's bench, and so on.
        ///
        /// @param context                  The graphics context.
        /// @param output                   Output buffer or image.
        ///                                 NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D!
        /// @param input                    Input buffer or texture.
        /// @param warmup_iterations        Number of iterations to run to "warm" up GL, ensures we don't hit
        ///                                 recompilations or similar when benching.
        /// @param iterations               Number of iterations to run the benchmark.
        ///                                 Each iteration will ensure timing with a glFinish() followed by timing.
        /// @param dispatches_per_iteration Number of calls to process() we should do per iteration.
        /// @param max_time                 The max time the benchmark should run. Will be checked after each iteration is complete.
        ///
        /// @returns Average GPU time per process() call.
        double bench(Context *context, Resource *output, Resource *input,
                unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration,
                double max_time = std::numeric_limits<double>::max());
        /// @brief Returns cost for a process() call. Only used for debugging.
        double get_cost() const { return cost; }
        /// @brief Returns number of passes (glDispatchCompute) in a process() call.
        ///
        /// The container size is implicitly converted from size_t to unsigned.
        unsigned get_num_passes() const { return passes.size(); }
        /// @brief Returns Nx.
        unsigned get_dimension_x() const { return size_x; }
        /// @brief Returns Ny.
        unsigned get_dimension_y() const { return size_y; }
        /// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures.
        ///
        /// By default, these values are 0.5 / size (samples in the center of texel (0, 0)).
        /// Scale is 1.0 / size, so it steps one texel for each coordinate in the FFT transform.
        /// Setting this to something custom is useful to get downsampling with GL_LINEAR -> FFT transform
        /// without having to downsample the texture first, then FFT.
        ///
        /// NOTE(review): the in-class initializers of `texture` are offset 0 / scale 1, so the
        /// defaults described above are presumably assigned by the constructors - confirm in glfft.cpp.
        void set_texture_offset_scale(float offset_x, float offset_y, float scale_x, float scale_y)
        {
            texture.offset_x = offset_x;
            texture.offset_y = offset_y;
            texture.scale_x = scale_x;
            texture.scale_y = scale_y;
        }
        /// @brief Set binding range for input.
        ///
        /// If input is an SSBO, set a custom binding range to be passed to glBindBufferRange.
        /// By default, the entire buffer is bound.
        void set_input_buffer_range(size_t offset, size_t size)
        {
            ssbo.input.offset = offset;
            ssbo.input.size = size;
        }
        /// @brief Set binding range for input_aux.
        ///
        /// If input_aux is an SSBO, set a custom binding range to be passed to glBindBufferRange.
        /// By default, the entire buffer is bound.
        void set_input_aux_buffer_range(size_t offset, size_t size)
        {
            ssbo.input_aux.offset = offset;
            ssbo.input_aux.size = size;
        }
        /// @brief Set binding range for output.
        ///
        /// If output buffer is an SSBO, set a custom binding range to be passed to glBindBufferRange.
        /// By default, the entire buffer is bound.
        void set_output_buffer_range(size_t offset, size_t size)
        {
            ssbo.output.offset = offset;
            ssbo.output.size = size;
        }
        /// @brief Set samplers for input textures.
        ///
        /// Set sampler objects to be used for input and input_aux if textures are used as input.
        /// By default, sampler object 0 will be used (inheriting sampler parameters from the texture object itself).
        void set_samplers(Sampler *sampler0, Sampler *sampler1 = nullptr)
        {
            texture.samplers[0] = sampler0;
            texture.samplers[1] = sampler1;
        }
    private:
        /// Raw, non-smart pointer to a graphics context (presumably the one given to the constructor).
        Context *context;
        /// One entry of `passes`; get_num_passes() reports how many of these exist.
        /// NOTE(review): the exact meaning of uv_scale_x and stride is defined in glfft.cpp.
        struct Pass
        {
            Parameters parameters;
            unsigned workgroups_x;
            unsigned workgroups_y;
            unsigned uv_scale_x;
            unsigned stride;
            Program *program;
        };
        /// Value reported by get_cost().
        double cost = 0.0;
        /// Buffers owned by this object. NOTE(review): their use is implemented in glfft.cpp.
        std::unique_ptr<Buffer> temp_buffer;
        std::unique_ptr<Buffer> temp_buffer_image;
        std::vector<Pass> passes;
        /// Shared program cache handle.
        std::shared_ptr<ProgramCache> cache;
        std::unique_ptr<Program> build_program(const Parameters &params);
        static std::string load_shader_string(const char *path);
        static void store_shader_string(const char *path, const std::string &source);
        Program* get_program(const Parameters &params);
        /// Texture sampling state written by set_texture_offset_scale() and set_samplers().
        struct
        {
            float offset_x = 0.0f, offset_y = 0.0f, scale_x = 1.0f, scale_y = 1.0f;
            Sampler *samplers[2] = { nullptr, nullptr };
        } texture;
        /// SSBO binding ranges written by the set_*_buffer_range() setters; all start out as 0 / 0.
        struct
        {
            struct
            {
                size_t offset = 0;
                size_t size = 0;
            } input, input_aux, output;
        } ssbo;
        /// Values reported by get_dimension_x() / get_dimension_y().
        unsigned size_x, size_y;
 };
 }
 #endif
--- a/glfft/glfft_common.hpp
+++ b/glfft/glfft_common.hpp
@@ -0,0 +1,178 @@
 /* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 // For the most part used by the implementation.
 #ifndef GLFFT_COMMON_HPP__
 #define GLFFT_COMMON_HPP__
 #include "glfft_interface.hpp"
 #include <functional>
 #include <cstddef>
 #include <cstdlib>
 #include <string>
 #include <cstring>
 #include <memory>
 #include <unordered_map>
 namespace GLFFT
 {
 /// Direction of a transform. The numeric values are fixed explicitly.
 enum Direction
 {
    /// Time/space domain -> frequency domain.
    Forward = -1,
    /// Frequency domain -> time/space domain, where two frequency-domain inputs
    /// are multiplied together first (convolution).
    InverseConvolve = 0,
    /// Frequency domain -> time/space domain.
    Inverse = 1
 };
 /// Transform mode selector. Values are spelled out; they match the implicit numbering.
 enum Mode
 {
    Horizontal = 0,
    HorizontalDual = 1,
    Vertical = 2,
    VerticalDual = 3,
    ResolveRealToComplex = 4,
    ResolveComplexToReal = 5
 };
 /// Kind of transform performed by a full FFT.
 enum Type
 {
    /// Plain complex -> complex transform.
    ComplexToComplex = 0,
    /// Complex -> complex transform on pairs of complex values (a four-component
    /// vector per sample). Typically used to transform RGBA data.
    ComplexToComplexDual = 1,
    /// Complex -> real transform. Each row consumes N / 2 + 1 complex values,
    /// laid out with a stride of N complex samples.
    ComplexToReal = 2,
    /// Real -> complex transform. Each row produces N / 2 + 1 complex values,
    /// laid out with a stride of N complex samples.
    RealToComplex = 3
 };
 /// GL object type used as input or output of a transform.
 enum Target
 {
    /// A GL_SHADER_STORAGE_BUFFER.
    SSBO = 0,
    /// A texture. As an output, its format follows from the transform type:
    /// ComplexToComplex / RealToComplex -> GL_RG16F
    /// ComplexToComplexDual -> GL_RGBA16F
    Image = 1,
    /// A single-component (real valued) texture. As an output, its format follows
    /// from the transform type:
    /// ComplexToReal -> GL_R32F (because GLES 3.1 doesn't have GL_R16F image type).
    ImageReal = 2
 };
 struct Parameters
 {
    unsigned workgroup_size_x;
    unsigned workgroup_size_y;
    unsigned workgroup_size_z;
    unsigned radix;
    unsigned vector_size;
    Direction direction;
    Mode mode;
    Target input_target;
    Target output_target;
    bool p1;
    bool shared_banked;
    bool fft_fp16, input_fp16, output_fp16;
    bool fft_normalize;
    bool operator==(const Parameters &other) const
    {
        return std::memcmp(this, &other, sizeof(Parameters)) == 0;
    }
 };
 /// @brief Options for the FFT implementation.
 /// The performance defaults are conservative.
 struct FFTOptions
 {
    struct Performance
    {
        /// Workgroup size used in layout(local_size_x).
        /// Affects performance only, but a large value can make small FFT sizes
        /// impossible to implement, in which case the FFT constructor throws.
        unsigned workgroup_size_x{4};
        /// Workgroup size in the Y dimension (presumably layout(local_size_y)).
        /// Affects performance only, but a large value can make small FFT sizes
        /// impossible to implement, in which case the FFT constructor throws.
        unsigned workgroup_size_y{1};
        /// Vector size; highly GPU dependent. "Scalar" GPUs prefer 2, vector GPUs
        /// prefer 4 (and maybe 8).
        unsigned vector_size{2};
        /// Use banked shared memory. Desktop GPUs generally prefer true,
        /// mobile GPUs false.
        bool shared_banked{false};
    } performance;
    struct Type
    {
        /// Internal shader arithmetic uses mediump float.
        bool fp16{false};
        /// Input SSBO holds packed 2xfp16 values instead of regular FP32.
        bool input_fp16{false};
        /// Output SSBO holds packed 2xfp16 values instead of regular FP32.
        bool output_fp16{false};
        /// Apply the 1 / N normalization factor.
        bool normalize{false};
    } type;
 };
 }
 namespace std
 {
    /// @brief std::hash specialization so GLFFT::Parameters can key an unordered_map.
    ///
    /// Each field is hashed individually instead of walking the raw object bytes:
    /// the struct contains padding with indeterminate content, and XOR-ing per-byte
    /// hashes together collapses the result into a very small range (typically at most
    /// 256 distinct values), which makes almost every lookup collide.
    template<>
    struct hash<GLFFT::Parameters>
    {
        std::size_t operator()(const GLFFT::Parameters &params) const
        {
            std::size_t h = 0;
            // Order-dependent mixing step (same shape as the well known hash_combine).
            const auto mix = [&h](std::size_t value)
            {
                h ^= value + 0x9e3779b9u + (h << 6) + (h >> 2);
            };
            mix(params.workgroup_size_x);
            mix(params.workgroup_size_y);
            mix(params.workgroup_size_z);
            mix(params.radix);
            mix(params.vector_size);
            mix(static_cast<std::size_t>(params.direction));
            mix(static_cast<std::size_t>(params.mode));
            mix(static_cast<std::size_t>(params.input_target));
            mix(static_cast<std::size_t>(params.output_target));
            // Fold the seven flags into one word so they are mixed in a single step.
            const std::size_t flags =
                (params.p1 ? 1u : 0u) |
                (params.shared_banked ? 2u : 0u) |
                (params.fft_fp16 ? 4u : 0u) |
                (params.input_fp16 ? 8u : 0u) |
                (params.output_fp16 ? 16u : 0u) |
                (params.fft_normalize ? 32u : 0u);
            mix(flags);
            return h;
        }
    };
 }
 namespace GLFFT
 {
 /// @brief Owns compiled programs, keyed by the Parameters they were built for.
 class ProgramCache
 {
    public:
        /// Number of programs currently stored.
        size_t cache_size() const { return programs.size(); }
        /// Looks up the program stored for @p parameters (defined out of line).
        Program* find_program(const Parameters &parameters) const;
        /// Hands ownership of @p program to the cache under @p parameters (defined out of line).
        void insert_program(const Parameters &parameters, std::unique_ptr<Program> program);
    private:
        std::unordered_map<Parameters, std::unique_ptr<Program>> programs;
 };
 }
 #endif
--- a/glfft/glfft_gl_api_headers.hpp
+++ b/glfft/glfft_gl_api_headers.hpp
@@ -0,0 +1,6 @@
 /* Let GLFFT use GLava's headers */
 #define GLFFT_GLSL_LANG_STRING "#version 430 core\n"
 extern "C" {
    #include "../glava/glad.h"
 }
--- a/glfft/glfft_gl_interface.cpp
+++ b/glfft/glfft_gl_interface.cpp
@@ -0,0 +1,310 @@
 /* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "glfft_gl_interface.hpp"
 #ifdef GLFFT_GL_DEBUG
 #include "glfft_validate.hpp"
 #endif
 #include <cstdarg>
 #include <cstring>
 #include <vector>
 using namespace GLFFT;
 using namespace std;
 /// Definition of the static command buffer member; request_command_buffer() returns its address.
 GLCommandBuffer GLContext::static_command_buffer;
 /// Makes @p program the current GL program; a null pointer selects program 0.
 void GLCommandBuffer::bind_program(Program *program)
 {
    GLuint handle = 0;
    if (program)
        handle = static_cast<GLProgram*>(program)->name;
    glUseProgram(handle);
 }
 void GLCommandBuffer::bind_storage_texture(unsigned binding, Texture *texture, Format format)
 {
    glBindImageTexture(binding, static_cast<GLTexture*>(texture)->name,
            0, GL_FALSE, 0, GL_WRITE_ONLY, convert(format));
 }
 void GLCommandBuffer::bind_texture(unsigned binding, Texture *texture)
 {
    glActiveTexture(GL_TEXTURE0 + binding);
    glBindTexture(GL_TEXTURE_2D, static_cast<GLTexture*>(texture)->name);
 }
 void GLCommandBuffer::bind_sampler(unsigned binding, Sampler *sampler)
 {
    glBindSampler(binding, sampler ? static_cast<GLSampler*>(sampler)->name : 0);
 }
 void GLCommandBuffer::bind_storage_buffer(unsigned binding, Buffer *buffer)
 {
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, binding, static_cast<GLBuffer*>(buffer)->name);
 }
 void GLCommandBuffer::bind_storage_buffer_range(unsigned binding, size_t offset, size_t size, Buffer *buffer)
 {
    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, static_cast<GLBuffer*>(buffer)->name, offset, size);
 }
 /// Issues a compute dispatch with the given workgroup counts through glDispatchCompute.
 void GLCommandBuffer::dispatch(unsigned x, unsigned y, unsigned z)
 {
    glDispatchCompute(x, y, z);
 }
 /// Issues a GL_SHADER_STORAGE_BARRIER_BIT memory barrier.
 /// The buffer argument is unused; the barrier is not restricted to one resource.
 void GLCommandBuffer::barrier(Buffer*)
 {
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 }
 /// Issues a GL_TEXTURE_FETCH_BARRIER_BIT memory barrier.
 /// The texture argument is unused; the barrier is not restricted to one resource.
 void GLCommandBuffer::barrier(Texture*)
 {
    glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
 }
 /// Issues a full memory barrier (GL_ALL_BARRIER_BITS).
 void GLCommandBuffer::barrier()
 {
    glMemoryBarrier(GL_ALL_BARRIER_BITS);
 }
 /// @brief Uploads a small block of constant data into the next uniform buffer of the
 ///        ring and binds that buffer to GL_UNIFORM_BUFFER binding point @p binding.
 ///
 /// The call is ignored when no ring was installed with set_constant_data_buffers(), or
 /// when @p size is larger than CommandBuffer::MaxConstantDataSize: only that many bytes
 /// are mapped, so copying more would write past the mapping.
 ///
 /// @param binding Uniform buffer binding point.
 /// @param data    Source bytes.
 /// @param size    Number of bytes to copy from @p data.
 void GLCommandBuffer::push_constant_data(unsigned binding, const void *data, size_t size)
 {
    if (!ubos || ubo_count == 0)
        return;
    if (size > static_cast<size_t>(CommandBuffer::MaxConstantDataSize))
        return;
    // glBindBufferBase also sets the generic GL_UNIFORM_BUFFER binding used by the map below.
    glBindBufferBase(GL_UNIFORM_BUFFER, binding, ubos[ubo_index]);
    void *ptr = glMapBufferRange(GL_UNIFORM_BUFFER,
            0, CommandBuffer::MaxConstantDataSize,
            GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
    if (ptr)
    {
        if (size != 0)
            std::memcpy(ptr, data, size);
        glUnmapBuffer(GL_UNIFORM_BUFFER);
    }
    // Advance the ring so the next upload uses a different buffer.
    if (++ubo_index >= ubo_count)
        ubo_index = 0;
 }
 /// Returns the static command buffer. On the first call for this context the ring of
 /// uniform buffers is created, sized and handed to the command buffer.
 /// GL_UNIFORM_BUFFER is left bound to the last ring buffer after that first call.
 CommandBuffer* GLContext::request_command_buffer()
 {
    if (initialized_ubos)
        return &static_command_buffer;
    glGenBuffers(MaxBuffersRing, ubos);
    for (const GLuint ring_buffer : ubos)
    {
        glBindBuffer(GL_UNIFORM_BUFFER, ring_buffer);
        glBufferData(GL_UNIFORM_BUFFER, CommandBuffer::MaxConstantDataSize, nullptr, GL_STREAM_DRAW);
    }
    static_command_buffer.set_constant_data_buffers(ubos, MaxBuffersRing);
    initialized_ubos = true;
    return &static_command_buffer;
 }
 /// Intentionally empty: this implementation performs no work on submit.
 void GLContext::submit_command_buffer(CommandBuffer*)
 {}
 /// Blocks until GL has finished all previously issued commands (glFinish).
 void GLContext::wait_idle()
 {
    glFinish();
 }
 /// Creates an owned GLTexture of the given size and format, optionally filled from
 /// @p initial_data, and returns it through the generic Texture interface.
 unique_ptr<Texture> GLContext::create_texture(const void *initial_data,
        unsigned width, unsigned height,
        Format format)
 {
    unique_ptr<Texture> texture(new GLTexture(initial_data, width, height, format));
    return texture;
 }
 /// Creates an owned GLBuffer of @p size bytes, optionally filled from @p initial_data,
 /// and returns it through the generic Buffer interface.
 unique_ptr<Buffer> GLContext::create_buffer(const void *initial_data, size_t size, AccessMode access)
 {
    unique_ptr<Buffer> buffer(new GLBuffer(initial_data, size, access));
    return buffer;
 }
 /// @brief Compiles and links @p source as a compute shader program.
 ///
 /// GLFFT_GLSL_LANG_STRING is prepended to the source as the first shader string.
 /// Compile and link errors are reported through log().
 ///
 /// @param source GLSL compute shader source without the #version line.
 /// @returns The linked program, or nullptr if validation (debug builds), object
 ///          creation, compilation or linking failed.
 unique_ptr<Program> GLContext::compile_compute_shader(const char *source)
 {
 #ifdef GLFFT_GL_DEBUG
    if (!validate_glsl_source(source))
        return nullptr;
 #endif
    GLuint program = glCreateProgram();
    if (!program)
    {
        return nullptr;
    }
    GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
    if (!shader)
    {
        // Do not leak the program object when the shader object cannot be created.
        glDeleteProgram(program);
        return nullptr;
    }
    const char *sources[] = { GLFFT_GLSL_LANG_STRING, source };
    glShaderSource(shader, 2, sources, nullptr);
    glCompileShader(shader);
    GLint status = GL_FALSE;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
    if (status == GL_FALSE)
    {
        GLint len = 0;
        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &len);
        // At least one zero-initialised byte, so the "%s" below always sees a valid string.
        vector<char> buf(len > 0 ? static_cast<size_t>(len) : 1);
        glGetShaderInfoLog(shader, static_cast<GLsizei>(buf.size()), nullptr, buf.data());
        log("GLFFT: Shader log:\n%s\n\n", buf.data());
        glDeleteShader(shader);
        glDeleteProgram(program);
        return nullptr;
    }
    glAttachShader(program, shader);
    glLinkProgram(program);
    // Only flags the shader for deletion; it is released together with the program.
    glDeleteShader(shader);
    glGetProgramiv(program, GL_LINK_STATUS, &status);
    if (status == GL_FALSE)
    {
        GLint len = 0;
        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &len);
        vector<char> buf(len > 0 ? static_cast<size_t>(len) : 1);
        glGetProgramInfoLog(program, static_cast<GLsizei>(buf.size()), nullptr, buf.data());
        log("Program log:\n%s\n\n", buf.data());
        // Deleting the program also releases the already-flagged shader; deleting the
        // shader a second time here would only raise a GL error.
        glDeleteProgram(program);
        return nullptr;
    }
    return unique_ptr<Program>(new GLProgram(program));
 }
 /// Formats a printf-style message into a 4 KiB stack buffer (vsnprintf truncates longer
 /// output) and forwards the result to glfft_log.
 void GLContext::log(const char *fmt, ...)
 {
    char message[4096];
    va_list args;
    va_start(args, fmt);
    vsnprintf(message, sizeof(message), fmt, args);
    va_end(args);
    glfft_log("%s", message);
 }
 /// Returns the current time as reported by glfft_time().
 double GLContext::get_time()
 {
    return glfft_time();
 }
 /// @brief Queries GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS.
 ///
 /// @returns The queried limit, or 0 if the query left the value untouched
 ///          (glGetIntegerv does not write its output when it raises an error).
 unsigned GLContext::get_max_work_group_threads()
 {
    GLint value = 0;
    glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &value);
    return static_cast<unsigned>(value);
 }
 const char* GLContext::get_renderer_string()
 {
    return reinterpret_cast<const char*>(glGetString(GL_RENDERER));
 }
 /// Maps @p size bytes of @p buffer starting at @p offset for reading.
 /// Uses the GL_SHADER_STORAGE_BUFFER target and leaves it unbound afterwards.
 /// Returns whatever glMapBufferRange returned.
 const void* GLContext::map(Buffer *buffer, size_t offset, size_t size)
 {
    const GLuint handle = static_cast<GLBuffer*>(buffer)->name;
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, handle);
    const void *mapped = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, offset, size, GL_MAP_READ_BIT);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
    return mapped;
 }
 /// Unmaps @p buffer. Uses the GL_SHADER_STORAGE_BUFFER target and leaves it unbound afterwards.
 void GLContext::unmap(Buffer *buffer)
 {
    const GLuint handle = static_cast<GLBuffer*>(buffer)->name;
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, handle);
    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
 }
 /// Deletes the uniform buffer ring if it was created. Safe to call more than once.
 void GLContext::teardown()
 {
    if (initialized_ubos)
    {
        glDeleteBuffers(MaxBuffersRing, ubos);
        initialized_ubos = false;
    }
 }
 /// Releases the GL objects held by the context by calling teardown().
 GLContext::~GLContext()
 {
    teardown();
 }
 /// Creates a 2D texture with one level of immutable storage, clamp-to-edge wrapping and
 /// nearest filtering, and uploads @p initial_data into it when that pointer is non-null.
 /// The GL_TEXTURE_2D binding of the active texture unit is reset to 0 on return.
 GLTexture::GLTexture(const void *initial_data,
        unsigned width, unsigned height,
        Format format)
 {
    const GLenum internal_format = convert(format);
    glGenTextures(1, &name);
    glBindTexture(GL_TEXTURE_2D, name);
    glTexStorage2D(GL_TEXTURE_2D, 1, internal_format, width, height);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    if (initial_data != nullptr)
    {
        const GLenum pixel_format = convert_format(format);
        const GLenum pixel_type = convert_type(format);
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height,
                pixel_format, pixel_type, initial_data);
    }
    glBindTexture(GL_TEXTURE_2D, 0);
 }
 /// Deletes the GL texture, but only when this object owns it.
 GLTexture::~GLTexture()
 {
    if (!owned)
        return;
    glDeleteTextures(1, &name);
 }
 /// Creates a buffer of @p size bytes through the GL_SHADER_STORAGE_BUFFER target,
 /// initialised from @p initial_data (passed straight to glBufferData).
 /// The target is left unbound on return.
 GLBuffer::GLBuffer(const void *initial_data, size_t size, AccessMode access)
 {
    const GLenum usage = convert(access);
    glGenBuffers(1, &name);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, name);
    glBufferData(GL_SHADER_STORAGE_BUFFER, size, initial_data, usage);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
 }
 /// Deletes the GL buffer, but only when this object owns it.
 GLBuffer::~GLBuffer()
 {
    if (!owned)
        return;
    glDeleteBuffers(1, &name);
 }
 /// Stores the GL program name; the destructor deletes it when it is non-zero.
 GLProgram::GLProgram(GLuint name)
    : name(name)
 {}
 /// Deletes the GL program unless the stored name is 0.
 GLProgram::~GLProgram()
 {
    if (name == 0)
        return;
    glDeleteProgram(name);
 }
 /// Deletes the GL sampler unless the stored name is 0.
 GLSampler::~GLSampler()
 {
    if (name == 0)
        return;
    glDeleteSamplers(1, &name);
 }
--- a/glfft/glfft_gl_interface.hpp
+++ b/glfft/glfft_gl_interface.hpp
@@ -0,0 +1,258 @@
 /* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef GLFFT_GL_INTERFACE_HPP__
 #define GLFFT_GL_INTERFACE_HPP__
 #include "glfft_interface.hpp"
 #include "glfft_gl_api_headers.hpp"
 /* GLava additions (POSIX) */
 extern "C" {
    #include <errno.h>
    #include <error.h>
    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>
 }
 #ifndef GLFFT_GLSL_LANG_STRING
 #error GLFFT_GLSL_LANG_STRING must be defined to e.g. "#version 310 es\n" or "#version 430 core\n".
 #endif
 #ifndef GLFFT_LOG_OVERRIDE
 /// @brief Default GLFFT log sink: printf-style output to stdout.
 ///
 /// Declared inline because it is defined in a header; without it every translation
 /// unit including this header would emit its own external definition.
 /// Define GLFFT_LOG_OVERRIDE to replace it.
 inline void glfft_log(const char *fmt, ...) {
    va_list l;
    va_start(l, fmt);
    vfprintf(stdout, fmt, l);
    va_end(l);
 }
 #else
 #define glfft_log GLFFT_LOG_OVERRIDE
 #endif
 #ifndef GLFFT_TIME_OVERRIDE
 /// @brief Default GLFFT time source: CLOCK_REALTIME in seconds, as a double.
 ///
 /// Declared inline because it is defined in a header; without it every translation
 /// unit including this header would emit its own external definition.
 /// Define GLFFT_TIME_OVERRIDE to replace it.
 ///
 /// @returns Seconds since the clock's epoch, or 0.0 if clock_gettime fails
 ///          (the failure is reported on stderr).
 inline double glfft_time() {
    // Zero-initialised so a failed clock_gettime never leads to an indeterminate read.
    struct timespec tv = {};
    if (clock_gettime(CLOCK_REALTIME, &tv)) {
        fprintf(stderr, "clock_gettime(CLOCK_REALTIME, ...): %s\n", strerror(errno));
    }
    return (double) tv.tv_sec + ((double) tv.tv_nsec / 1000000000.0);
 }
 #else
 #define glfft_time GLFFT_TIME_OVERRIDE
 #endif
 namespace GLFFT
 {
    class GLContext;
    /// @brief Texture implementation backed by a GL texture name.
    ///
    /// Textures made through the private constructor are owned and deleted by the
    /// destructor; a texture wrapped through GLTexture(GLuint) is not deleted.
    class GLTexture : public Texture
    {
        public:
            /// Wraps an existing GL texture name without taking ownership of it.
            GLTexture(GLuint obj) : name(obj), owned(false) {}
            ~GLTexture();
            /// Returns the GL texture name.
            GLuint get() const { return name; }
            friend class GLContext;
            friend class GLCommandBuffer;
        private:
            /// Creates a new, owned texture; reachable only by the friend classes.
            GLTexture(const void *initial_data,
                    unsigned width, unsigned height,
                    Format format);
            GLuint name;
            bool owned = true;
    };
    // Not really used by test and bench code, but can be useful for API users.
    /// @brief Sampler implementation backed by a GL sampler name.
    ///
    /// Not really used by test and bench code, but can be useful for API users.
    /// The destructor deletes the wrapped sampler whenever its name is non-zero.
    class GLSampler : public Sampler
    {
        public:
            /// Wraps an existing GL sampler name.
            GLSampler(GLuint obj) : name(obj) {}
            ~GLSampler();
            /// Returns the GL sampler name.
            GLuint get() const { return name; }
            friend class GLContext;
            friend class GLCommandBuffer;
        private:
            GLuint name;
    };
    /// @brief Buffer implementation backed by a GL buffer name.
    ///
    /// Buffers made through the private constructor are owned and deleted by the
    /// destructor; a buffer wrapped through GLBuffer(GLuint) is not deleted.
    class GLBuffer : public Buffer
    {
        public:
            /// Wraps an existing GL buffer name without taking ownership of it.
            GLBuffer(GLuint obj) : name(obj), owned(false) {}
            ~GLBuffer();
            /// Returns the GL buffer name.
            GLuint get() const { return name; }
            friend class GLContext;
            friend class GLCommandBuffer;
        private:
            /// Creates a new, owned buffer; reachable only by the friend classes.
            GLBuffer(const void *initial_data, size_t size, AccessMode access);
            GLuint name;
            bool owned = true;
    };
    /// @brief Program implementation backed by a GL program name.
    ///
    /// Instances can only be created by the friend classes; the destructor deletes
    /// the program whenever its name is non-zero.
    class GLProgram : public Program
    {
        public:
            ~GLProgram();
            /// Returns the GL program name.
            GLuint get() const { return name; }
            friend class GLContext;
            friend class GLCommandBuffer;
        private:
            GLProgram(GLuint name);
            GLuint name;
    };
    /// @brief CommandBuffer implementation whose member functions are defined in
    ///        glfft_gl_interface.cpp.
    ///
    /// Constant data is pushed through a ring of uniform buffers that is supplied
    /// from outside via set_constant_data_buffers(); the buffers are not owned here.
    class GLCommandBuffer : public CommandBuffer
    {
        public:
            ~GLCommandBuffer() = default;
            /// Installs the uniform buffer ring (@p count names at @p ubos) and restarts
            /// the ring at its first buffer. Only the pointer is stored, not a copy.
            void set_constant_data_buffers(const GLuint *ubos, unsigned count)
            {
                ubo_count = count;
                ubo_index = 0;
                this->ubos = ubos;
            }
            // Binding of programs and resources.
            void bind_program(Program *program) override;
            void bind_storage_texture(unsigned binding, Texture *texture, Format format) override;
            void bind_texture(unsigned binding, Texture *texture) override;
            void bind_sampler(unsigned binding, Sampler *sampler) override;
            void bind_storage_buffer(unsigned binding, Buffer *texture) override;
            void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) override;
            // Execution and synchronisation.
            void dispatch(unsigned x, unsigned y, unsigned z) override;
            void barrier(Buffer *buffer) override;
            void barrier(Texture *buffer) override;
            void barrier() override;
            // Constant data upload.
            void push_constant_data(unsigned binding, const void *data, size_t size) override;
        private:
            /// Ring of uniform buffer names (not owned), its length, and the next slot to use.
            const GLuint *ubos = nullptr;
            unsigned ubo_count = 0;
            unsigned ubo_index = 0;
    };
    /// @brief Context implementation on top of OpenGL.
    ///
    /// Holds a ring of MaxBuffersRing uniform buffer names which is created lazily by
    /// request_command_buffer() and deleted by teardown() (also run by the destructor).
    class GLContext : public Context
    {
        public:
            ~GLContext();
            // Resource creation.
            std::unique_ptr<Texture> create_texture(const void *initial_data,
                    unsigned width, unsigned height,
                    Format format) override;
            std::unique_ptr<Buffer> create_buffer(const void *initial_data, size_t size, AccessMode access) override;
            std::unique_ptr<Program> compile_compute_shader(const char *source) override;
            // Command submission.
            CommandBuffer* request_command_buffer() override;
            void submit_command_buffer(CommandBuffer *cmd) override;
            void wait_idle() override;
            // Queries and utilities.
            const char* get_renderer_string() override;
            void log(const char *fmt, ...) override;
            double get_time() override;
            unsigned get_max_work_group_threads() override;
            // Buffer readback.
            const void* map(Buffer *buffer, size_t offset, size_t size) override;
            void unmap(Buffer *buffer) override;
            // Texture readback is not supported in GLES, so a platform-specific
            // context has to override these two to provide it.
            bool supports_texture_readback() override { return false; }
            void read_texture(void*, Texture*, Format) override {}
        protected:
            /// Deletes the uniform buffer ring if it exists.
            void teardown();
        private:
            /// Static, i.e. one command buffer object for the whole class.
            static GLCommandBuffer static_command_buffer;
            enum { MaxBuffersRing = 256 };
            GLuint ubos[MaxBuffersRing];
            bool initialized_ubos = false;
    };
    /// Maps a GLFFT access mode to the GL buffer usage enum; 0 for unlisted values.
    static inline GLenum convert(AccessMode mode)
    {
        GLenum usage = 0;
        switch (mode)
        {
            case AccessStreamCopy:
                usage = GL_STREAM_COPY;
                break;
            case AccessStaticCopy:
                usage = GL_STATIC_COPY;
                break;
            case AccessStreamRead:
                usage = GL_STREAM_READ;
                break;
        }
        return usage;
    }
    /// Maps a GLFFT format to the GL sized internal format; 0 for FormatUnknown
    /// and for unlisted values.
    static inline GLenum convert(Format format)
    {
        GLenum internal_format = 0;
        switch (format)
        {
            case FormatR16G16B16A16Float:
                internal_format = GL_RGBA16F;
                break;
            case FormatR32G32B32A32Float:
                internal_format = GL_RGBA32F;
                break;
            case FormatR32Float:
                internal_format = GL_R32F;
                break;
            case FormatR16G16Float:
                internal_format = GL_RG16F;
                break;
            case FormatR32G32Float:
                internal_format = GL_RG32F;
                break;
            case FormatR32Uint:
                internal_format = GL_R32UI;
                break;
            case FormatUnknown:
                break;
        }
        return internal_format;
    }
    /// Maps a GLFFT format to the GL pixel transfer format (component layout);
    /// 0 for FormatUnknown and for unlisted values.
    static inline GLenum convert_format(Format format)
    {
        switch (format)
        {
            case FormatR16G16Float:
            case FormatR32G32Float:
                return GL_RG;
            case FormatR16G16B16A16Float:
            case FormatR32G32B32A32Float:
                return GL_RGBA;
            case FormatR32Float:
                return GL_RED;
            case FormatR32Uint:
                return GL_RED_INTEGER;
            case FormatUnknown:
                return 0;
        }
        return 0;
    }
    /// Maps a GLFFT format to the GL pixel transfer component type;
    /// 0 for FormatUnknown and for unlisted values.
    static inline GLenum convert_type(Format format)
    {
        switch (format)
        {
            case FormatR16G16Float:
            case FormatR16G16B16A16Float:
                return GL_HALF_FLOAT;
            case FormatR32Float:
            case FormatR32G32Float:
            case FormatR32G32B32A32Float:
                return GL_FLOAT;
            case FormatR32Uint:
                return GL_UNSIGNED_INT;
            case FormatUnknown:
                return 0;
        }
        return 0;
    }
 }
 #endif
--- a/glfft/glfft_interface.hpp
+++ b/glfft/glfft_interface.hpp
@@ -0,0 +1,131 @@
 /* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef GLFFT_INTERFACE_HPP__
 #define GLFFT_INTERFACE_HPP__
 #include <memory>
 namespace GLFFT
 {
    class Context;
    // Common polymorphic base for the objects a Context creates (Texture, Sampler, Buffer).
    // Only derived classes can construct one, since the default constructor is protected.
    class Resource
    {
        public:
            // Virtual so a derived resource is destroyed correctly through a Resource pointer.
            virtual ~Resource() = default;
            // Non-movable, non-copyable to make things simpler.
            // Declaring the move constructor also leaves the implicit copy constructor deleted.
            Resource(Resource&&) = delete;
            void operator=(const Resource&) = delete;
        protected:
            Resource() = default;
    };
    // Empty tag types that distinguish the kinds of Resource at the interface level.
    // Backends are expected to derive their concrete implementations from these.
    class Texture : public Resource {};
    class Sampler : public Resource {};
    class Buffer : public Resource {};
    // Opaque handle to a compiled compute program (see Context::compile_compute_shader).
    // Unlike Resource, copy and move are not explicitly deleted here.
    class Program
    {
        public:
            virtual ~Program() = default;
        protected:
            // Context is a friend so it can reach the protected constructor.
            friend class Context;
            Program() = default;
    };
    // Usage hint passed to Context::create_buffer.
    // Enumerator order defines the numeric values; do not reorder.
    enum AccessMode
    {
        AccessStreamCopy,
        AccessStaticCopy,
        AccessStreamRead
    };
    // Texel formats understood by the interface. Names spell out channels and bit width,
    // e.g. FormatR16G16Float is two 16-bit float channels.
    // Enumerator order defines the numeric values; do not reorder.
    enum Format
    {
        FormatUnknown,
        FormatR16G16B16A16Float,
        FormatR32G32B32A32Float,
        FormatR32G32Float,
        FormatR32Float,
        FormatR16G16Float,
        FormatR32Uint
    };
    class CommandBuffer;
    // Abstract backend interface: resource creation, command submission and device queries.
    // Every member is pure virtual; a concrete graphics backend supplies the implementation.
    class Context
    {
        public:
            virtual ~Context() = default;
            // Creates a width x height texture of the given format.
            // initial_data may be null (FFTWisdom::study passes nullptr for output textures).
            virtual std::unique_ptr<Texture> create_texture(const void *initial_data,
                    unsigned width, unsigned height,
                    Format format) = 0;
            // Creates a buffer of `size` bytes; initial_data may be null (study does so for outputs).
            virtual std::unique_ptr<Buffer> create_buffer(const void *initial_data, size_t size, AccessMode access) = 0;
            // Compiles compute shader source text into a Program.
            virtual std::unique_ptr<Program> compile_compute_shader(const char *source) = 0;
            // Returns a raw CommandBuffer pointer; ownership is not expressed in the type.
            // NOTE(review): presumably owned by the Context - confirm against the backend.
            virtual CommandBuffer* request_command_buffer() = 0;
            virtual void submit_command_buffer(CommandBuffer *cmd) = 0;
            virtual void wait_idle() = 0;
            // Renderer name; FFTWisdom matches vendor substrings ("GeForce", "Radeon", "Mali") against it.
            virtual const char* get_renderer_string() = 0;
            // printf-style logging sink.
            virtual void log(const char *fmt, ...) = 0;
            // NOTE(review): time unit is not visible here - confirm (likely seconds).
            virtual double get_time() = 0;
            // Upper bound used by FFTWisdom to clamp max_workgroup_size.
            virtual unsigned get_max_work_group_threads() = 0;
            // Maps a byte range of `buffer` for reading; pair with unmap().
            virtual const void* map(Buffer *buffer, size_t offset, size_t size) = 0;
            virtual void unmap(Buffer *buffer) = 0;
            // Whether read_texture() may be used on this backend.
            virtual bool supports_texture_readback() = 0;
            virtual void read_texture(void *buffer, Texture *texture, Format format) = 0;
        protected:
            Context() = default;
    };
    // Abstract recorder for compute work: bind state, dispatch, and insert barriers.
    // Obtained from Context::request_command_buffer and handed back via submit_command_buffer.
    class CommandBuffer
    {
        public:
            virtual ~CommandBuffer() = default;
            virtual void bind_program(Program *program) = 0;
            virtual void bind_storage_texture(unsigned binding, Texture *texture, Format format) = 0;
            virtual void bind_texture(unsigned binding, Texture *texture) = 0;
            virtual void bind_sampler(unsigned binding, Sampler *sampler) = 0;
            // Note: the parameter is named `texture` but it is a Buffer.
            virtual void bind_storage_buffer(unsigned binding, Buffer *texture) = 0;
            // Binds only the byte range [offset, offset + length) of the buffer.
            virtual void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) = 0;
            // Launches x * y * z work groups.
            virtual void dispatch(unsigned x, unsigned y, unsigned z) = 0;
            // Barrier overloads: scoped to one buffer, one texture, or unscoped.
            virtual void barrier(Buffer *buffer) = 0;
            virtual void barrier(Texture *buffer) = 0;
            virtual void barrier() = 0;
            // NOTE(review): presumably the byte limit for push_constant_data's `size` - confirm.
            enum { MaxConstantDataSize = 64 };
            virtual void push_constant_data(unsigned binding, const void *data, size_t size) = 0;
        protected:
            CommandBuffer() = default;
    };
 }
 #endif
--- a/glfft/glfft_wisdom.cpp
+++ b/glfft/glfft_wisdom.cpp
@@ -0,0 +1,600 @@
 /* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "glfft_wisdom.hpp"
 #include "glfft_interface.hpp"
 #include "glfft.hpp"
 #include <utility>
 /* GLAVA NOTICE: automatic wisdom serialization support may be added at a later date */
 #ifdef GLFFT_SERIALIZATION
 #include "rapidjson/reader.h"
 #include "rapidjson/prettywriter.h"
 #include "rapidjson/stringbuffer.h"
 #include "rapidjson/document.h"
 using namespace rapidjson;
 #endif
 #ifdef GLFFT_CLI_ASYNC
 #include "glfft_cli.hpp"
 #endif
 using namespace std;
 using namespace GLFFT;
 // Derives vendor-specific tuning limits from the renderer string reported by the context.
 //
 // The renderer name is searched for known vendor substrings ("GeForce"/"Quadro",
 // "Radeon", "Mali"). Unrecognized renderers, and backends that report no renderer
 // string at all, get the FFTStaticWisdom defaults unchanged.
 FFTStaticWisdom FFTWisdom::get_static_wisdom_from_renderer(Context *context)
 {
    FFTStaticWisdom res;
    const char *renderer = context->get_renderer_string();
    unsigned threads = context->get_max_work_group_threads();
    if (!renderer)
    {
        // A backend may fail to report a renderer name; strstr() on a null
        // pointer is undefined behavior, so fall back to the defaults.
        return res;
    }
    if (strstr(renderer, "GeForce") || strstr(renderer, "Quadro"))
    {
        context->log("Detected GeForce/Quadro GPU.\n");
        res.min_workgroup_size = 32; // Warp threads.
        res.min_workgroup_size_shared = 32;
        res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
        res.min_vector_size = 2;
        res.max_vector_size = 2;
        res.shared_banked = FFTStaticWisdom::True;
    }
    else if (strstr(renderer, "Radeon"))
    {
        context->log("Detected Radeon GPU.\n");
        res.min_workgroup_size = 64; // Wavefront threads (GCN).
        res.min_workgroup_size_shared = 128;
        res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
        // TODO: Find if we can restrict this to 2 or 4 always.
        res.min_vector_size = 2;
        res.max_vector_size = 4;
        res.shared_banked = FFTStaticWisdom::True;
    }
    else if (strstr(renderer, "Mali"))
    {
        context->log("Detected Mali GPU.\n");
        res.min_workgroup_size = 4;
        res.min_workgroup_size_shared = 4;
        res.max_workgroup_size = 64; // Going beyond 64 threads per WG is not a good idea.
        res.min_vector_size = 4;
        res.max_vector_size = 4;
        res.shared_banked = FFTStaticWisdom::False;
    }
    // TODO: Add more GPUs.
    return res;
 }
 // Returns {cost, performance options} for one pass configuration.
 // A configuration already present in the library is answered from there;
 // otherwise it is benchmarked via study() and the result is stored before returning.
 pair<double, FFTOptions::Performance> FFTWisdom::learn_optimal_options(
        Context *context, unsigned Nx, unsigned Ny, unsigned radix,
        Mode mode, Target input_target, Target output_target,
        const FFTOptions::Type &type)
 {
    WisdomPass key = {
        { Nx, Ny, radix, mode, input_target, output_target, type },
        0.0,
    };

    // Fast path: this scenario was learned before.
    const auto known = library.find(key);
    if (known != library.end())
    {
        return pair<double, FFTOptions::Performance>(known->first.cost, known->second);
    }

    // Slow path: benchmark, then remember the measured cost alongside the key.
    auto learned = study(context, key, type);
    key.cost = learned.first;
    library[key] = learned.second;
    return learned;
 }
 // Learns wisdom for every pass variant a transform of the given type and size may need:
 // plain SSBO -> SSBO passes for each radix, first/last passes touching the requested
 // input/output targets, and (for real <-> complex transforms) the resolve pass.
 // Failures while learning a variant are swallowed on purpose; only AsyncCancellation
 // (when GLFFT_CLI_ASYNC is defined) is propagated.
 void FFTWisdom::learn_optimal_options_exhaustive(Context *context,
        unsigned Nx, unsigned Ny,
        Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type)
 {
    const bool learn_resolve = (type == ComplexToReal) || (type == RealToComplex);
    const bool dual = (type == ComplexToComplexDual);
    const Mode vertical_mode = dual ? VerticalDual : Vertical;
    const Mode horizontal_mode = dual ? HorizontalDual : Horizontal;
    // Real <-> complex transforms run their radix passes at half the width.
    const unsigned width = Nx >> learn_resolve;

    // Learns the vertical pass (only when there is more than one row) and then the horizontal pass.
    const auto learn_both_axes = [&](unsigned radix, Target in, Target out)
    {
        if (Ny > 1)
        {
            learn_optimal_options(context, width, Ny, radix, vertical_mode, in, out, fft_type);
        }
        learn_optimal_options(context, width, Ny, radix, horizontal_mode, in, out, fft_type);
    };

    // Create wisdom for horizontal transforms and vertical transform.
    static const unsigned radices[] = { 4, 8, 16, 64 };
    for (const unsigned radix : radices)
    {
        try
        {
            // Plain transforms.
            learn_both_axes(radix, SSBO, SSBO);
            // First/last pass transforms. Can be fairly significant since accessing textures
            // makes more sense with block interleave and larger WG_Y sizes.
            if (input_target != SSBO)
            {
                learn_both_axes(radix, input_target, SSBO);
            }
            if (output_target != SSBO)
            {
                learn_both_axes(radix, SSBO, output_target);
            }
        }
 #ifdef GLFFT_CLI_ASYNC
        catch (const AsyncCancellation &)
        {
            throw;
        }
 #endif
        catch (...)
        {
            // If our default options cannot successfully create the radix pass (i.e. throws),
            // just ignore it for purpose of creating wisdom.
        }
    }

    // Only real <-> complex transforms need a resolve pass.
    if (!learn_resolve)
    {
        return;
    }

    const bool complex_to_real = (type == ComplexToReal);
    const bool single_row = (Ny == 1);
    const Mode resolve_mode = complex_to_real ? ResolveComplexToReal : ResolveRealToComplex;

    FFTOptions::Type resolve_type = fft_type;
    Target resolve_input_target = SSBO;
    if (complex_to_real && single_row)
    {
        // For a C2R Nx1 transform the resolve is the first pass, so it sees the caller's
        // input target and unmodified types.
        resolve_input_target = input_target;
    }
    else
    {
        resolve_type.input_fp16 = resolve_type.output_fp16;
    }

    // For an R2C Nx1 transform the resolve is the last pass, so it writes to output_target.
    const Target resolve_output_target = (single_row && !complex_to_real) ? output_target : SSBO;

    try
    {
        learn_optimal_options(context, width, Ny, 2, resolve_mode,
                resolve_input_target, resolve_output_target, resolve_type);
    }
 #ifdef GLFFT_CLI_ASYNC
    catch (const AsyncCancellation &)
    {
        throw;
    }
 #endif
    catch (...)
    {
        // If our default options cannot successfully create the radix pass (i.e. throws),
        // just ignore it for purpose of creating wisdom.
    }
 }
 // Builds a single-pass FFT for the scenario in `pass` with the given options and returns
 // the cost reported by FFT::bench, using this object's benchmark parameters.
 double FFTWisdom::bench(Context *context, Resource *output, Resource *input,
        const WisdomPass &pass, const FFTOptions &options, const shared_ptr<ProgramCache> &cache) const
 {
    const auto &scenario = pass.pass;
    // The fifth FFT constructor argument is the radix for SSBO inputs and 1 otherwise.
    const unsigned radix_or_one = (scenario.input_target == SSBO) ? scenario.radix : 1;

    FFT fft(context, scenario.Nx, scenario.Ny, scenario.radix, radix_or_one,
            scenario.mode, scenario.input_target, scenario.output_target,
            cache, options);

    return fft.bench(context, output, input,
            params.warmup, params.iterations, params.dispatches, params.timeout);
 }
 // Number of floats per element that study() reserves for a pass of the given mode:
 // 4 for the dual and resolve modes, 2 for everything else.
 static inline unsigned mode_to_size(Mode mode)
 {
    const bool four_floats = mode == VerticalDual ||
                             mode == HorizontalDual ||
                             mode == ResolveRealToComplex ||
                             mode == ResolveComplexToReal;
    return four_floats ? 4 : 2;
 }
 // Benchmarks one pass scenario across a grid of performance options and returns
 // {lowest measured cost, options that produced it}.
 //
 // Input and output resources are created to match the scenario's targets, a baseline is
 // measured with default-constructed Performance options, and then every combination of
 // shared_banked / vector_size / workgroup size that passes the static-wisdom filters is tried.
 //
 // Throws logic_error when a non-SSBO target is combined with a mode that has no texture
 // format here. An exception from the baseline bench propagates; exceptions from individual
 // variants are swallowed (except AsyncCancellation when GLFFT_CLI_ASYNC is defined).
 std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, const WisdomPass &pass, FFTOptions::Type type) const
 {
    // One program cache is shared by every FFT built during this study.
    auto cache = make_shared<ProgramCache>();
    unique_ptr<Resource> output;
    unique_ptr<Resource> input;
    // Scratch data: mode_size floats per element of the Nx x Ny grid (value-initialized to zero).
    unsigned mode_size = mode_to_size(pass.pass.mode);
    vector<float> tmp(mode_size * pass.pass.Nx * pass.pass.Ny);
    if (pass.pass.input_target == SSBO)
    {
        // Byte size is halved when the input is FP16 (shift by the bool).
        input = context->create_buffer(tmp.data(), tmp.size() * sizeof(float) >> type.input_fp16, AccessStaticCopy);
    }
    else
    {
        // Texture input: choose a format from the mode.
        Format format = FormatUnknown;
        unsigned Nx = pass.pass.Nx;
        unsigned Ny = pass.pass.Ny;
        switch (pass.pass.mode)
        {
            case VerticalDual:
            case HorizontalDual:
                format = FormatR32G32B32A32Float;
                break;
            case Vertical:
            case Horizontal:
                format = FormatR32G32Float;
                break;
            case ResolveComplexToReal:
                // The resolve input texture is twice as wide as the pass width.
                format = FormatR32G32Float;
                Nx *= 2;
                break;
            default:
                throw logic_error("Invalid input mode.\n");
        }
        input = context->create_texture(tmp.data(), Nx, Ny, format);
    }
    if (pass.pass.output_target == SSBO)
    {
        // No initial data for the output; byte size is halved when the output is FP16.
        output = context->create_buffer(nullptr, tmp.size() * sizeof(float) >> type.output_fp16, AccessStreamCopy);
    }
    else
    {
        // Texture output: mirrors the input case, with ResolveRealToComplex as the widened mode.
        Format format = FormatUnknown;
        unsigned Nx = pass.pass.Nx;
        unsigned Ny = pass.pass.Ny;
        switch (pass.pass.mode)
        {
            case VerticalDual:
            case HorizontalDual:
                format = FormatR32G32B32A32Float;
                break;
            case Vertical:
            case Horizontal:
                format = FormatR32G32Float;
                break;
            case ResolveRealToComplex:
                format = FormatR32G32Float;
                Nx *= 2;
                break;
            default:
                throw logic_error("Invalid output mode.\n");
        }
        output = context->create_texture(nullptr, Nx, Ny, format);
    }
    // Exhaustive search, look for every sensible combination, and find fastest parameters.
    // Get initial best cost with defaults.
    // This baseline call is outside any try block, so a failure here aborts the study.
    FFTOptions::Performance best_perf;
    double minimum_cost = bench(context, output.get(), input.get(), pass, { best_perf, type }, cache);
    // Candidate values for each tunable.
    static const FFTStaticWisdom::Tristate shared_banked_values[] = { FFTStaticWisdom::False, FFTStaticWisdom::True };
    static const unsigned vector_size_values[] = { 2, 4, 8 };
    static const unsigned workgroup_size_x_values[] = { 4, 8, 16, 32, 64, 128, 256 };
    static const unsigned workgroup_size_y_values[] = { 1, 2, 4, 8, };
    bool test_resolve = pass.pass.mode == ResolveComplexToReal || pass.pass.mode == ResolveRealToComplex;
    bool test_dual = pass.pass.mode == VerticalDual || pass.pass.mode == HorizontalDual;
    // Counts variants that benchmarked successfully (the baseline is not included).
    unsigned bench_count = 0;
    for (auto shared_banked : shared_banked_values)
    {
        // Useless test, since shared banked is only relevant for radix 16/64.
        if (pass.pass.radix < 16 && shared_banked)
        {
            continue;
        }
        // Honor the static wisdom's preference unless it is DontCare or the radix makes it irrelevant.
        bool fair_shared_banked = (pass.pass.radix < 16) ||
                                  (static_wisdom.shared_banked == FFTStaticWisdom::DontCare) ||
                                  (shared_banked == static_wisdom.shared_banked);
        if (!fair_shared_banked)
        {
            continue;
        }
        for (auto vector_size : vector_size_values)
        {
            // Resolve passes currently only support vector size 2. Shared banked makes no sense either.
            if (test_resolve && (vector_size != 2 || shared_banked))
            {
                continue;
            }
            // We can only use vector_size 8 with FP16.
            if (vector_size == 8 && (!type.fp16 || !type.input_fp16 || !type.output_fp16))
            {
                continue;
            }
            // Makes little sense to test since vector_size will be bumped to 4 anyways.
            if (test_dual && vector_size < 4)
            {
                continue;
            }
            for (auto workgroup_size_x : workgroup_size_x_values)
            {
                for (auto workgroup_size_y : workgroup_size_y_values)
                {
                    unsigned workgroup_size  = workgroup_size_x * workgroup_size_y;
                    // Radix 16 and above uses the separate "_shared" minimum.
                    unsigned min_workgroup_size = pass.pass.radix >= 16 ? static_wisdom.min_workgroup_size_shared :
                                                                          static_wisdom.min_workgroup_size;
                    // Dual modes raise both vector-size bounds to at least 4.
                    unsigned min_vector_size = test_dual ? max(4u, static_wisdom.min_vector_size) : static_wisdom.min_vector_size;
                    unsigned max_vector_size = test_dual ? max(4u, static_wisdom.max_vector_size) : static_wisdom.max_vector_size;
                    bool fair_workgroup_size = workgroup_size <= static_wisdom.max_workgroup_size &&
                                               workgroup_size >= min_workgroup_size;
                    // A single-row transform gains nothing from more than one workgroup row.
                    if (pass.pass.Ny == 1 && workgroup_size_y > 1)
                    {
                        fair_workgroup_size = false;
                    }
                    if (!fair_workgroup_size)
                    {
                        continue;
                    }
                    // If we have dual mode, accept vector sizes larger than max.
                    // Resolve passes bypass the range check (they were already limited to 2 above).
                    bool fair_vector_size = test_resolve || (vector_size <= max_vector_size &&
                                                             vector_size >= min_vector_size);
                    if (!fair_vector_size)
                    {
                        continue;
                    }
                    FFTOptions::Performance perf;
                    perf.shared_banked = shared_banked;
                    perf.vector_size = vector_size;
                    perf.workgroup_size_x = workgroup_size_x;
                    perf.workgroup_size_y = workgroup_size_y;
                    try
                    {
                        // If workgroup sizes are too big for our test, this will throw.
                        double cost = bench(context, output.get(), input.get(), pass, { perf, type }, cache);
                        bench_count++;
 #if 1
                        context->log("\nWisdom run (mode = %u, radix = %u):\n", pass.pass.mode, pass.pass.radix);
                        context->log("  Width:            %4u\n", pass.pass.Nx);
                        context->log("  Height:           %4u\n", pass.pass.Ny);
                        context->log("  Shared banked:     %3s\n", shared_banked ? "yes" : "no");
                        context->log("  Vector size:         %u\n", vector_size);
                        context->log("  Workgroup size: (%u, %u)\n", workgroup_size_x, workgroup_size_y);
                        context->log("  Cost:         %8.3g\n", cost);
 #endif
                        // Strictly lower cost wins; ties keep the earlier candidate.
                        if (cost < minimum_cost)
                        {
 #if 1
                            context->log("  New optimal solution! (%g -> %g)\n", minimum_cost, cost);
 #endif
                            best_perf = perf;
                            minimum_cost = cost;
                        }
                    }
 #ifdef GLFFT_CLI_ASYNC
                    catch (const AsyncCancellation &)
                    {
                        throw;
                    }
 #endif
                    catch (...)
                    {
                        // If we pass in bogus parameters,
                        // FFT will throw and we just ignore this.
                    }
                }
            }
        }
    }
    context->log("Tested %u variants!\n", bench_count);
    return make_pair(minimum_cost, best_perf);
 }
 // Looks up previously learned wisdom for one pass scenario.
 // Returns a pointer to the library entry (key with cost, and performance options),
 // or nullptr when the scenario has not been learned. Never benchmarks.
 const pair<const WisdomPass, FFTOptions::Performance>* FFTWisdom::find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
        Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type) const
 {
    const WisdomPass key = {
        { Nx, Ny, radix, mode, input_target, output_target, type },
        0.0,
    };

    const auto entry = library.find(key);
    if (entry == library.end())
    {
        return nullptr;
    }
    return &*entry;
 }
 // Returns the learned performance options for a pass scenario, or base_options.performance
 // when nothing has been learned for it.
 // The result is a reference into either the library or base_options, so the caller must
 // keep whichever of the two it came from alive while using it.
 const FFTOptions::Performance& FFTWisdom::find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
        Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const
 {
    const WisdomPass key = {
        { Nx, Ny, radix, mode, input_target, output_target, base_options.type },
        0.0,
    };

    const auto entry = library.find(key);
 #if 0
    if (entry == end(library))
    {
        context->log("Didn't find options for (%u x %u, radix %u, mode %u, input_target %u, output_target %u)\n",
                Nx, Ny, radix, unsigned(mode), unsigned(input_target), unsigned(output_target));
    }
 #endif
    if (entry != library.end())
    {
        return entry->second;
    }
    return base_options.performance;
 }
 #ifdef GLFFT_SERIALIZATION
 std::string FFTWisdom::archive() const
 {
    StringBuffer s;
    PrettyWriter<StringBuffer> writer{s};
    writer.StartObject();
    writer.String("library");
    // Serialize all wisdom accumulated to a string.
    writer.StartArray();
    for (auto &entry : library)
    {
        writer.StartObject();
        writer.String("scenario");
        writer.StartObject();
        writer.String("nx");
        writer.Uint(entry.first.pass.Nx);
        writer.String("ny");
        writer.Uint(entry.first.pass.Ny);
        writer.String("radix");
        writer.Uint(entry.first.pass.radix);
        writer.String("mode");
        writer.Uint(entry.first.pass.mode);
        writer.String("input_target");
        writer.Uint(entry.first.pass.input_target);
        writer.String("output_target");
        writer.Uint(entry.first.pass.output_target);
        writer.EndObject();
        writer.String("type");
        writer.StartObject();
        writer.String("fp16");
        writer.Bool(entry.first.pass.type.fp16);
        writer.String("input_fp16");
        writer.Bool(entry.first.pass.type.input_fp16);
        writer.String("output_fp16");
        writer.Bool(entry.first.pass.type.output_fp16);
        writer.String("normalize");
        writer.Bool(entry.first.pass.type.normalize);
        writer.EndObject();
        writer.String("performance");
        writer.StartObject();
        writer.String("shared_banked");
        writer.Bool(entry.second.shared_banked);
        writer.String("vector_size");
        writer.Uint(entry.second.vector_size);
        writer.String("workgroup_size_x");
        writer.Uint(entry.second.workgroup_size_x);
        writer.String("workgroup_size_y");
        writer.Uint(entry.second.workgroup_size_y);
        writer.EndObject();
        writer.String("cost");
        writer.Double(entry.first.cost);
        writer.EndObject();
    }
    writer.EndArray();
    writer.EndObject();
    return s.GetString();
 }
 void FFTWisdom::extract(const char *json)
 {
    Document document;
    document.Parse(json);
    // Exception safe, we don't want to risk throwing in the middle of the
    // loop, leaving the library is broken state.
    unordered_map<WisdomPass, FFTOptions::Performance> new_library;
    auto &lib = document["library"];
    // y u no begin(), end() :(
    for (Value::ConstValueIterator itr = lib.Begin(); itr != lib.End(); ++itr)
    {
        auto &v = *itr;
        WisdomPass pass;
        FFTOptions::Performance perf;
        pass.cost = v["cost"].GetDouble();
        auto &scenario = v["scenario"];
        pass.pass.Nx = scenario["nx"].GetUint();
        pass.pass.Ny = scenario["ny"].GetUint();
        pass.pass.radix = scenario["radix"].GetUint();
        pass.pass.mode = static_cast<Mode>(scenario["mode"].GetUint());
        pass.pass.input_target = static_cast<Target>(scenario["input_target"].GetUint());
        pass.pass.output_target = static_cast<Target>(scenario["output_target"].GetUint());
        auto &type = v["type"];
        pass.pass.type.fp16 = type["fp16"].GetBool();
        pass.pass.type.input_fp16 = type["input_fp16"].GetBool();
        pass.pass.type.output_fp16 = type["output_fp16"].GetBool();
        pass.pass.type.normalize = type["normalize"].GetBool();
        auto &performance = v["performance"];
        perf.shared_banked = performance["shared_banked"].GetBool();
        perf.vector_size = performance["vector_size"].GetUint();
        perf.workgroup_size_x = performance["workgroup_size_x"].GetUint();
        perf.workgroup_size_y = performance["workgroup_size_y"].GetUint();
        new_library[pass] = perf;
    }
    // Exception safe.
    swap(library, new_library);
 }
 #endif
--- a/glfft/glfft_wisdom.hpp
+++ b/glfft/glfft_wisdom.hpp
@@ -0,0 +1,149 @@
 /* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef GLFFT_WISDOM_HPP__
 #define GLFFT_WISDOM_HPP__
 #include <unordered_map>
 #include <utility>
 #include <string>
 #include "glfft_common.hpp"
 #include "glfft_interface.hpp"
 namespace GLFFT
 {
 // Key type of the wisdom library: a pass scenario plus the cost measured for it.
 // Only `pass` takes part in equality (and in the std::hash specialization); `cost` is
 // carried along as payload.
 struct WisdomPass
 {
    // Scenario description. Compared and hashed as raw bytes, so field layout matters.
    struct
    {
        unsigned Nx;
        unsigned Ny;
        unsigned radix;
        Mode mode;
        Target input_target;
        Target output_target;
        FFTOptions::Type type;
    } pass;
    // Benchmark cost recorded for this scenario; ignored by operator==.
    double cost;
    // Bytewise comparison of `pass`.
    // NOTE(review): this also compares any padding bytes inside `pass`; it relies on the
    // struct having no padding (or padding being consistently initialized) - confirm.
    bool operator==(const WisdomPass &other) const
    {
        return std::memcmp(&pass, &other.pass, sizeof(pass)) == 0;
    }
 };
 }
 namespace std
 {
    // std::hash specialization that lets GLFFT::WisdomPass key an unordered_map.
    // Hashes the raw bytes of params.pass, the same bytes WisdomPass::operator== compares,
    // so equal keys always hash equally. params.cost does not take part.
    template<>
    struct hash<GLFFT::WisdomPass>
    {
        std::size_t operator()(const GLFFT::WisdomPass &params) const
        {
            // FNV-1a over the bytes. XOR-ing per-byte hashes together is order-insensitive
            // and typically spans only 256 values, so distinct scenarios (e.g. swapped
            // Nx/Ny) would pile into the same bucket; FNV-1a mixes position in.
            // The 64-bit constants are truncated on platforms with a narrower size_t.
            const uint8_t *bytes = reinterpret_cast<const uint8_t*>(&params.pass);
            std::size_t h = static_cast<std::size_t>(14695981039346656037ull);
            for (std::size_t i = 0; i < sizeof(params.pass); i++)
            {
                h ^= bytes[i];
                h *= static_cast<std::size_t>(1099511628211ull);
            }
            return h;
        }
    };
 }
 namespace GLFFT
 {
 // Adds information which depends on the GPU vendor.
 // This can speed up learning process, since there will be fewer "obviously wrong" settings to test.
 // Bounds that FFTWisdom::study uses to skip candidate configurations before benchmarking.
 // The member initializers are the defaults used when no vendor is recognized.
 struct FFTStaticWisdom
 {
    // DontCare means study() tries both shared_banked settings.
    enum Tristate { True = 1, False = 0, DontCare = -1 };
    // Lower bound on workgroup_size_x * workgroup_size_y for radix < 16.
    unsigned min_workgroup_size = 1;
    // Lower bound used instead when radix >= 16.
    unsigned min_workgroup_size_shared = 1;
    unsigned max_workgroup_size = 128; // GLES 3.1 mandates support for this.
    // Inclusive range of vector sizes to try (raised to at least 4 for dual modes).
    unsigned min_vector_size = 2;
    unsigned max_vector_size = 4;
    Tristate shared_banked = DontCare;
 };
 // Store of learned per-pass performance options ("wisdom"), keyed by pass scenario.
 // Can learn new entries by benchmarking on a Context and answer lookups for known ones.
 class FFTWisdom
 {
    public:
        // Returns {cost, options} for one scenario, benchmarking and caching it if unknown.
        std::pair<double, FFTOptions::Performance> learn_optimal_options(Context *ctx,
                unsigned Nx, unsigned Ny, unsigned radix,
                Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type);
        // Learns every pass variant a transform of this type and size may use;
        // failures on individual variants are ignored.
        void learn_optimal_options_exhaustive(Context *ctx,
                unsigned Nx, unsigned Ny,
                Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type);
        // Lookup only: pointer to the library entry, or nullptr when the scenario is unknown.
        const std::pair<const WisdomPass, FFTOptions::Performance>* find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
                Mode mode, Target input_target, Target output_target, const FFTOptions::Type &base_options) const;
        // Lookup only: learned options, or base_options.performance when unknown.
        // Returns a reference; base_options must outlive its use.
        const FFTOptions::Performance& find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
                Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const;
        // Installs the bounds that study() uses to filter candidate configurations.
        void set_static_wisdom(FFTStaticWisdom static_wisdom) { this->static_wisdom = static_wisdom; }
        // Builds static wisdom from vendor substrings in the context's renderer string.
        static FFTStaticWisdom get_static_wisdom_from_renderer(Context *context);
        // Overrides the parameters forwarded to FFT::bench for every benchmark run.
        void set_bench_params(unsigned warmup,
                unsigned iterations, unsigned dispatches, double timeout)
        {
            params.warmup = warmup;
            params.iterations = iterations;
            params.dispatches = dispatches;
            params.timeout = timeout;
        }
 #ifdef GLFFT_SERIALIZATION
        // Serialization interface.
        // archive() dumps the library as JSON; extract() replaces the library from such JSON.
        std::string archive() const;
        void extract(const char *json);
 #endif
    private:
        // Learned options per scenario; the key's hash and equality ignore its cost field.
        std::unordered_map<WisdomPass, FFTOptions::Performance> library;
        // Benchmarks one scenario across the candidate grid; returns {best cost, best options}.
        std::pair<double, FFTOptions::Performance> study(Context *context,
                const WisdomPass &pass, FFTOptions::Type options) const;
        // Runs a single benchmark of one scenario with one set of options.
        double bench(Context *cmd, Resource *output, Resource *input,
                const WisdomPass &pass, const FFTOptions &options,
                const std::shared_ptr<ProgramCache> &cache) const;
        FFTStaticWisdom static_wisdom;
        // Benchmark parameters passed through to FFT::bench.
        // NOTE(review): timeout unit is not visible here - confirm (likely seconds).
        struct
        {
            unsigned warmup = 2;
            unsigned iterations = 20;
            unsigned dispatches = 50;
            double timeout = 1.0;
        } params;
 };
 }
 #endif