Add GLFFT

2019-09-11 23:06:03 -07:00
parent 580e296e86
commit b26f11ae20
10 changed files with 3001 additions and 0 deletions
--- a/glfft/LICENSE_ORIGINAL
+++ b/glfft/LICENSE_ORIGINAL
@@ -0,0 +1,19 @@
+Copyright (c) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/glfft/glfft.cpp
+++ b/glfft/glfft.cpp
--- a/glfft/glfft.hpp
+++ b/glfft/glfft.hpp
@@ -0,0 +1,225 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef GLFFT_HPP__
+#define GLFFT_HPP__
+
+#include "glfft_interface.hpp"
+#include "glfft_common.hpp"
+#include "glfft_wisdom.hpp"
+#include <vector>
+#include <unordered_map>
+#include <limits>
+
+/// GLFFT doesn't try to preserve GL state in any way.
+/// E.g. SHADER_STORAGE_BUFFER bindings, programs bound, texture bindings, etc.
+/// Applications calling this library must expect that some GL state will be modified.
+/// No rendering state associated with graphics will be modified.
+
+namespace GLFFT
+{
+
+class FFT
+{
+    public:
+        /// @brief Creates a full FFT.
+        ///
+        /// All buffer allocation done by GLFFT will be done in constructor.
+        /// Will throw if invalid parameters are passed.
+        ///
+        /// @param context       The graphics context.
+        /// @param Nx            Number of samples in horizontal dimension.
+        /// @param Ny            Number of samples in vertical dimension.
+        /// @param type          The transform type.
+        /// @param direction     Forward, inverse or inverse with convolution.
+        ///                      For real-to-complex and complex-to-real transforms, the
+        ///                      transform type must match.
+        /// @param input_target  GL object type of input target. For real-to-complex with texture as input, ImageReal is used.
+        /// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used.
+        /// @param cache         A program cache for caching the GLFFT programs created.
+        /// @param options       FFT options such as performance related parameters and types.
+        /// @param wisdom        GLFFT wisdom which can override performance related options
+        ///                      (options.performance is used as a fallback).
+        FFT(Context *context, unsigned Nx, unsigned Ny,
+                Type type, Direction direction, Target input_target, Target output_target,
+                std::shared_ptr<ProgramCache> cache, const FFTOptions &options,
+                const FFTWisdom &wisdom = FFTWisdom());
+
+        /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs.
+        ///
+        /// All buffer allocation done by GLFFT will be done in constructor.
+        /// Will throw if invalid parameters are passed.
+        ///
+        /// @param context       The graphics context.
+        /// @param Nx            Number of samples in horizontal dimension.
+        /// @param Ny            Number of samples in vertical dimension.
+        /// @param radix         FFT radix to test.
+        /// @param p             Accumulated p factor. If 1, "first pass" mode is tested, otherwise, generic FFT stages.
+        /// @param mode          The transform mode.
+        /// @param input_target  GL object type of input target. For real-to-complex with texture as input, ImageReal is used.
+        /// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used.
+        /// @param cache         A program cache for caching the GLFFT programs created.
+        /// @param options       FFT options such as performance related parameters and types.
+        FFT(Context *context, unsigned Nx, unsigned Ny, unsigned radix, unsigned p,
+                Mode mode, Target input_target, Target output_target,
+                std::shared_ptr<ProgramCache> cache, const FFTOptions &options);
+
+        /// @brief Process the FFT.
+        ///
+        /// The type of object passed here must match what FFT was initialized with.
+        ///
+        /// @param cmd       Command buffer for issuing dispatch commands.
+        /// @param output    Output buffer or image.
+        ///                  NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D!
+        /// @param input     Input buffer or texture.
+        /// @param input_aux If using convolution transform type,
+        ///                  the content of input and input_aux will be multiplied together.
+        void process(CommandBuffer *cmd, Resource *output, Resource *input, Resource *input_aux = nullptr);
+
+        /// @brief Run process() multiple times, timing the results.
+        ///
+        /// Mostly used internally by GLFFT wisdom, glfft_cli's bench, and so on.
+        ///
+        /// @param context                  The graphics context.
+        /// @param output                   Output buffer or image.
+        ///                                 NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D!
+        /// @param input                    Input buffer or texture.
+        /// @param warmup_iterations        Number of iterations to run to "warm" up GL, ensures we don't hit
+        ///                                 recompilations or similar when benching.
+        /// @param iterations               Number of iterations to run the benchmark.
+        ///                                 Each iteration will ensure timing with a glFinish() followed by timing.
+        /// @param dispatches_per_iteration Number of calls to process() we should do per iteration.
+        /// @param max_time                 The max time the benchmark should run. Will be checked after each iteration is complete.
+        ///
+        /// @returns Average GPU time per process() call.
+        double bench(Context *context, Resource *output, Resource *input,
+                unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration,
+                double max_time = std::numeric_limits<double>::max());
+
+        /// @brief Returns cost for a process() call. Only used for debugging.
+        double get_cost() const { return cost; }
+
+        /// @brief Returns number of passes (glDispatchCompute) in a process() call.
+        unsigned get_num_passes() const { return passes.size(); }
+
+        /// @brief Returns Nx.
+        unsigned get_dimension_x() const { return size_x; }
+        /// @brief Returns Ny.
+        unsigned get_dimension_y() const { return size_y; }
+
+        /// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures.
+        ///
+        /// By default, these values are 0.5 / size (samples in the center of texel (0, 0)).
+        /// Scale is 1.0 / size, so it steps one texel for each coordinate in the FFT transform.
+        /// Setting this to something custom is useful to get downsampling with GL_LINEAR -> FFT transform
+        /// without having to downsample the texture first, then FFT.
+        void set_texture_offset_scale(float offset_x, float offset_y, float scale_x, float scale_y)
+        {
+            texture.offset_x = offset_x;
+            texture.offset_y = offset_y;
+            texture.scale_x = scale_x;
+            texture.scale_y = scale_y;
+        }
+
+        /// @brief Set binding range for input.
+        ///
+        /// If input is an SSBO, set a custom binding range to be passed to glBindBufferRange.
+        /// By default, the entire buffer is bound.
+        void set_input_buffer_range(size_t offset, size_t size)
+        {
+            ssbo.input.offset = offset;
+            ssbo.input.size = size;
+        }
+
+        /// @brief Set binding range for input_aux.
+        ///
+        /// If input_aux is an SSBO, set a custom binding range to be passed to glBindBufferRange.
+        /// By default, the entire buffer is bound.
+        void set_input_aux_buffer_range(size_t offset, size_t size)
+        {
+            ssbo.input_aux.offset = offset;
+            ssbo.input_aux.size = size;
+        }
+
+        /// @brief Set binding range for output.
+        ///
+        /// If output buffer is an SSBO, set a custom binding range to be passed to glBindBufferRange.
+        /// By default, the entire buffer is bound.
+        void set_output_buffer_range(size_t offset, size_t size)
+        {
+            ssbo.output.offset = offset;
+            ssbo.output.size = size;
+        }
+
+        /// @brief Set samplers for input textures.
+        ///
+        /// Set sampler objects to be used for input and input_aux if textures are used as input.
+        /// By default, sampler object 0 will be used (inheriting sampler parameters from the texture object itself).
+        void set_samplers(Sampler *sampler0, Sampler *sampler1 = nullptr)
+        {
+            texture.samplers[0] = sampler0;
+            texture.samplers[1] = sampler1;
+        }
+
+    private:
+        Context *context;
+
+        struct Pass
+        {
+            Parameters parameters;
+
+            unsigned workgroups_x;
+            unsigned workgroups_y;
+            unsigned uv_scale_x;
+            unsigned stride;
+            Program *program;
+        };
+
+        double cost = 0.0;
+
+        std::unique_ptr<Buffer> temp_buffer;
+        std::unique_ptr<Buffer> temp_buffer_image;
+        std::vector<Pass> passes;
+        std::shared_ptr<ProgramCache> cache;
+
+        std::unique_ptr<Program> build_program(const Parameters &params);
+        static std::string load_shader_string(const char *path);
+        static void store_shader_string(const char *path, const std::string &source);
+
+        Program* get_program(const Parameters &params);
+
+        struct
+        {
+            float offset_x = 0.0f, offset_y = 0.0f, scale_x = 1.0f, scale_y = 1.0f;
+            Sampler *samplers[2] = { nullptr, nullptr };
+        } texture;
+
+        struct
+        {
+            struct
+            {
+                size_t offset = 0;
+                size_t size = 0;
+            } input, input_aux, output;
+        } ssbo;
+        unsigned size_x, size_y;
+};
+
+}
+
+#endif
--- a/glfft/glfft_common.hpp
+++ b/glfft/glfft_common.hpp
@@ -0,0 +1,178 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+// For the most part used by the implementation.
+
+#ifndef GLFFT_COMMON_HPP__
+#define GLFFT_COMMON_HPP__
+
+#include "glfft_interface.hpp"
+#include <functional>
+#include <cstddef>
+#include <cstdlib>
+#include <string>
+#include <cstring>
+#include <memory>
+#include <unordered_map>
+
+namespace GLFFT
+{
+
+enum Direction
+{
+    /// Forward FFT transform.
+    Forward = -1,
+    /// Inverse FFT transform, but with two inputs (in frequency domain) which are multiplied together
+    /// for convolution.
+    InverseConvolve = 0,
+    /// Inverse FFT transform.
+    Inverse = 1
+};
+
+enum Mode
+{
+    Horizontal,
+    HorizontalDual,
+    Vertical,
+    VerticalDual,
+
+    ResolveRealToComplex,
+    ResolveComplexToReal,
+};
+
+enum Type
+{
+    /// Regular complex-to-complex transform.
+    ComplexToComplex,
+    /// Complex-to-complex dual transform where the complex value is four-dimensional,
+    /// i.e. a vector of two complex values. Typically used to transform RGBA data.
+    ComplexToComplexDual,
+    /// Complex-to-real transform. N / 2 + 1 complex values are used per row with a stride of N complex samples.
+    ComplexToReal,
+    /// Real-to-complex transform. N / 2 + 1 complex output samples are created per row with a stride of N complex samples.
+    RealToComplex
+};
+
+enum Target
+{
+    /// GL_SHADER_STORAGE_BUFFER
+    SSBO,
+    /// Textures, when used as output, type is determined by transform type.
+    /// ComplexToComplex / RealToComplex -> GL_RG16F
+    /// ComplexToComplexDual -> GL_RGBA16F
+    Image,
+    /// Real-valued (single component) textures, when used as output, type is determined by transform type.
+    /// ComplexToReal -> GL_R32F (because GLES 3.1 doesn't have GL_R16F image type).
+    ImageReal
+};
+
+struct Parameters
+{
+    unsigned workgroup_size_x;
+    unsigned workgroup_size_y;
+    unsigned workgroup_size_z;
+    unsigned radix;
+    unsigned vector_size;
+    Direction direction;
+    Mode mode;
+    Target input_target;
+    Target output_target;
+    bool p1;
+    bool shared_banked;
+    bool fft_fp16, input_fp16, output_fp16;
+    bool fft_normalize;
+
+    bool operator==(const Parameters &other) const
+    {
+        return std::memcmp(this, &other, sizeof(Parameters)) == 0;
+    }
+};
+
+/// @brief Options for FFT implementation.
+/// Defaults for performance as conservative.
+struct FFTOptions
+{
+    struct Performance
+    {
+        /// Workgroup size used in layout(local_size_x).
+        /// Only affects performance, however, large values may make implementations of smaller sized FFTs impossible.
+        /// FFT constructor will throw in this case.
+        unsigned workgroup_size_x = 4;
+        /// Workgroup size used in layout(local_size_x).
+        /// Only affects performance, however, large values may make implementations of smaller sized FFTs impossible.
+        /// FFT constructor will throw in this case.
+        unsigned workgroup_size_y = 1;
+        /// Vector size. Very GPU dependent. "Scalar" GPUs prefer 2 here, vector GPUs prefer 4 (and maybe 8).
+        unsigned vector_size = 2;
+        /// Whether to use banked shared memory or not.
+        /// Desktop GPUs prefer true here, false for mobile in general.
+        bool shared_banked = false;
+    } performance;
+
+    struct Type
+    {
+        /// Whether internal shader should be mediump float.
+        bool fp16 = false;
+        /// Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
+        bool input_fp16 = false;
+        /// Whether output SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
+        bool output_fp16 = false;
+        /// Whether to apply 1 / N normalization factor.
+        bool normalize = false;
+    } type;
+};
+
+}
+
+namespace std
+{
+    template<>
+    struct hash<GLFFT::Parameters>
+    {
+        std::size_t operator()(const GLFFT::Parameters &params) const
+        {
+            std::size_t h = 0;
+            hash<uint8_t> hasher;
+            for (std::size_t i = 0; i < sizeof(GLFFT::Parameters); i++)
+            {
+                h ^= hasher(reinterpret_cast<const uint8_t*>(&params)[i]);
+            }
+
+            return h;
+        }
+    };
+}
+
+namespace GLFFT
+{
+
+class ProgramCache
+{
+    public:
+        Program* find_program(const Parameters &parameters) const;
+        void insert_program(const Parameters &parameters, std::unique_ptr<Program> program);
+        size_t cache_size() const { return programs.size(); }
+
+    private:
+        std::unordered_map<Parameters, std::unique_ptr<Program>> programs;
+};
+
+}
+
+#endif
+
--- a/glfft/glfft_gl_api_headers.hpp
+++ b/glfft/glfft_gl_api_headers.hpp
@@ -0,0 +1,6 @@
+
+/* Let GLFFT use GLava's headers */
+#define GLFFT_GLSL_LANG_STRING "#version 430 core\n"
+extern "C" {
+    #include "../glava/glad.h"
+}
--- a/glfft/glfft_gl_interface.cpp
+++ b/glfft/glfft_gl_interface.cpp
@@ -0,0 +1,310 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "glfft_gl_interface.hpp"
+#ifdef GLFFT_GL_DEBUG
+#include "glfft_validate.hpp"
+#endif
+#include <cstdarg>
+#include <cstring>
+#include <vector>
+
+using namespace GLFFT;
+using namespace std;
+
+GLCommandBuffer GLContext::static_command_buffer;
+
+void GLCommandBuffer::bind_program(Program *program)
+{
+    glUseProgram(program ? static_cast<GLProgram*>(program)->name : 0);
+}
+
+void GLCommandBuffer::bind_storage_texture(unsigned binding, Texture *texture, Format format)
+{
+    glBindImageTexture(binding, static_cast<GLTexture*>(texture)->name,
+            0, GL_FALSE, 0, GL_WRITE_ONLY, convert(format));
+}
+
+void GLCommandBuffer::bind_texture(unsigned binding, Texture *texture)
+{
+    glActiveTexture(GL_TEXTURE0 + binding);
+    glBindTexture(GL_TEXTURE_2D, static_cast<GLTexture*>(texture)->name);
+}
+
+void GLCommandBuffer::bind_sampler(unsigned binding, Sampler *sampler)
+{
+    glBindSampler(binding, sampler ? static_cast<GLSampler*>(sampler)->name : 0);
+}
+
+void GLCommandBuffer::bind_storage_buffer(unsigned binding, Buffer *buffer)
+{
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, binding, static_cast<GLBuffer*>(buffer)->name);
+}
+
+void GLCommandBuffer::bind_storage_buffer_range(unsigned binding, size_t offset, size_t size, Buffer *buffer)
+{
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, static_cast<GLBuffer*>(buffer)->name, offset, size);
+}
+
+void GLCommandBuffer::dispatch(unsigned x, unsigned y, unsigned z)
+{
+    glDispatchCompute(x, y, z);
+}
+
+void GLCommandBuffer::barrier(Buffer*)
+{
+    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+}
+
+void GLCommandBuffer::barrier(Texture*)
+{
+    glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
+}
+
+void GLCommandBuffer::barrier()
+{
+    glMemoryBarrier(GL_ALL_BARRIER_BITS);
+}
+
+void GLCommandBuffer::push_constant_data(unsigned binding, const void *data, size_t size)
+{
+    glBindBufferBase(GL_UNIFORM_BUFFER, binding, ubos[ubo_index]);
+    void *ptr = glMapBufferRange(GL_UNIFORM_BUFFER,
+            0, CommandBuffer::MaxConstantDataSize,
+            GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
+
+    if (ptr)
+    {
+        std::memcpy(ptr, data, size);
+        glUnmapBuffer(GL_UNIFORM_BUFFER);
+    }
+
+    if (++ubo_index >= ubo_count)
+        ubo_index = 0;
+}
+
+CommandBuffer* GLContext::request_command_buffer()
+{
+    if (!initialized_ubos)
+    {
+        glGenBuffers(MaxBuffersRing, ubos);
+        for (auto &ubo : ubos)
+        {
+            glBindBuffer(GL_UNIFORM_BUFFER, ubo);
+            glBufferData(GL_UNIFORM_BUFFER, CommandBuffer::MaxConstantDataSize, nullptr, GL_STREAM_DRAW);
+        }
+        static_command_buffer.set_constant_data_buffers(ubos, MaxBuffersRing);
+        initialized_ubos = true;
+    }
+    return &static_command_buffer;
+}
+
+void GLContext::submit_command_buffer(CommandBuffer*)
+{}
+
+void GLContext::wait_idle()
+{
+    glFinish();
+}
+
+unique_ptr<Texture> GLContext::create_texture(const void *initial_data,
+        unsigned width, unsigned height,
+        Format format)
+{
+    return unique_ptr<Texture>(new GLTexture(initial_data, width, height, format));
+}
+
+unique_ptr<Buffer> GLContext::create_buffer(const void *initial_data, size_t size, AccessMode access)
+{
+    return unique_ptr<Buffer>(new GLBuffer(initial_data, size, access));
+}
+
+unique_ptr<Program> GLContext::compile_compute_shader(const char *source)
+{
+#ifdef GLFFT_GL_DEBUG
+    if (!validate_glsl_source(source))
+        return nullptr;
+#endif
+
+    GLuint program = glCreateProgram();
+    if (!program)
+    {
+        return nullptr;
+    }
+
+    GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
+
+    const char *sources[] = { GLFFT_GLSL_LANG_STRING, source };
+    glShaderSource(shader, 2, sources, NULL);
+    glCompileShader(shader);
+
+    GLint status;
+    glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
+    if (status == GL_FALSE)
+    {
+        GLint len;
+        GLsizei out_len;
+
+        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &len);
+        vector<char> buf(len);
+        glGetShaderInfoLog(shader, len, &out_len, buf.data());
+        log("GLFFT: Shader log:\n%s\n\n", buf.data());
+
+        glDeleteShader(shader);
+        glDeleteProgram(program);
+        return 0;
+    }
+
+    glAttachShader(program, shader);
+    glLinkProgram(program);
+    glDeleteShader(shader);
+
+    glGetProgramiv(program, GL_LINK_STATUS, &status);
+    if (status == GL_FALSE)
+    {
+        GLint len;
+        GLsizei out_len;
+        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &len);
+        vector<char> buf(len);
+        glGetProgramInfoLog(program, len, &out_len, buf.data());
+        log("Program log:\n%s\n\n", buf.data());
+
+        glDeleteProgram(program);
+        glDeleteShader(shader);
+        return nullptr;
+    }
+
+    return unique_ptr<Program>(new GLProgram(program));
+}
+
+void GLContext::log(const char *fmt, ...)
+{
+    char buffer[4 * 1024];
+
+    va_list va;
+    va_start(va, fmt);
+    vsnprintf(buffer, sizeof(buffer), fmt, va);
+    va_end(va);
+    glfft_log("%s", buffer);
+}
+
+double GLContext::get_time()
+{
+    return glfft_time();
+}
+
+unsigned GLContext::get_max_work_group_threads()
+{
+    GLint value;
+    glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &value);
+    return value;
+}
+
+const char* GLContext::get_renderer_string()
+{
+    return reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+}
+
+const void* GLContext::map(Buffer *buffer, size_t offset, size_t size)
+{
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, static_cast<GLBuffer*>(buffer)->name);
+    const void *ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, offset, size, GL_MAP_READ_BIT);
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+    return ptr;
+}
+
+void GLContext::unmap(Buffer *buffer)
+{
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, static_cast<GLBuffer*>(buffer)->name);
+    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+}
+
+void GLContext::teardown()
+{
+    if (initialized_ubos)
+        glDeleteBuffers(MaxBuffersRing, ubos);
+    initialized_ubos = false;
+}
+
+GLContext::~GLContext()
+{
+    teardown();
+}
+
+GLTexture::GLTexture(const void *initial_data,
+        unsigned width, unsigned height,
+        Format format)
+{
+    glGenTextures(1, &name);
+    glBindTexture(GL_TEXTURE_2D, name);
+    glTexStorage2D(GL_TEXTURE_2D, 1, convert(format), width, height);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+    if (initial_data)
+    {
+        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height,
+                convert_format(format), convert_type(format), initial_data);
+    }
+
+    glBindTexture(GL_TEXTURE_2D, 0);
+}
+
+GLTexture::~GLTexture()
+{
+    if (owned)
+        glDeleteTextures(1, &name);
+}
+
+GLBuffer::GLBuffer(const void *initial_data, size_t size, AccessMode access)
+{
+    glGenBuffers(1, &name);
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, name);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, size, initial_data, convert(access));
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+}
+
+GLBuffer::~GLBuffer()
+{
+    if (owned)
+        glDeleteBuffers(1, &name);
+}
+
+GLProgram::GLProgram(GLuint name)
+    : name(name)
+{}
+
+GLProgram::~GLProgram()
+{
+    if (name != 0)
+    {
+        glDeleteProgram(name);
+    }
+}
+
+GLSampler::~GLSampler()
+{
+    if (name != 0)
+    {
+        glDeleteSamplers(1, &name);
+    }
+}
+
--- a/glfft/glfft_gl_interface.hpp
+++ b/glfft/glfft_gl_interface.hpp
@@ -0,0 +1,258 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef GLFFT_GL_INTERFACE_HPP__
+#define GLFFT_GL_INTERFACE_HPP__
+
+#include "glfft_interface.hpp"
+
+#include "glfft_gl_api_headers.hpp"
+
+/* GLava additions (POSIX) */
+extern "C" {
+    #include <time.h>
+    #include <stdarg.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <error.h>
+}
+
+#ifndef GLFFT_GLSL_LANG_STRING
+#error GLFFT_GLSL_LANG_STRING must be defined to e.g. "#version 310 es\n" or "#version 430 core\n".
+#endif
+
+#ifndef GLFFT_LOG_OVERRIDE
+void glfft_log(const char *fmt, ...) {
+    va_list l;
+    va_start(l, fmt);
+    vfprintf(stdout, fmt, l);
+    va_end(l);
+}
+#else
+#define glfft_log GLFFT_LOG_OVERRIDE
+#endif
+
+#ifndef GLFFT_TIME_OVERRIDE
+double glfft_time() {
+    struct timespec tv;
+    if (clock_gettime(CLOCK_REALTIME, &tv)) {
+        fprintf(stderr, "clock_gettime(CLOCK_REALTIME, ...): %s\n", strerror(errno));
+    }
+    return (double) tv.tv_sec + ((double) tv.tv_nsec / 1000000000.0);
+}
+#else
+#define glfft_time GLFFT_TIME_OVERRIDE
+#endif
+
+namespace GLFFT
+{
+    class GLContext;
+
+    class GLTexture : public Texture
+    {
+        public:
+            friend class GLContext;
+            friend class GLCommandBuffer;
+            ~GLTexture();
+
+            GLTexture(GLuint obj) : name(obj), owned(false) {}
+            GLuint get() const { return name; }
+
+        private:
+            GLTexture(const void *initial_data,
+                    unsigned width, unsigned height,
+                    Format format);
+            GLuint name;
+            bool owned = true;
+    };
+
+    // Not really used by test and bench code, but can be useful for API users.
+    class GLSampler : public Sampler
+    {
+        public:
+            friend class GLContext;
+            friend class GLCommandBuffer;
+            ~GLSampler();
+
+            GLSampler(GLuint obj) : name(obj) {}
+            GLuint get() const { return name; }
+
+        private:
+            GLuint name;
+    };
+
+    class GLBuffer : public Buffer
+    {
+        public:
+            friend class GLContext;
+            friend class GLCommandBuffer;
+            ~GLBuffer();
+
+            GLBuffer(GLuint obj) : name(obj), owned(false) {}
+            GLuint get() const { return name; }
+
+        private:
+            GLuint name;
+            GLBuffer(const void *initial_data, size_t size, AccessMode access);
+            bool owned = true;
+    };
+
+    class GLProgram : public Program
+    {
+        public:
+            friend class GLContext;
+            friend class GLCommandBuffer;
+            ~GLProgram();
+
+            GLuint get() const { return name; }
+
+        private:
+            GLProgram(GLuint name);
+            GLuint name;
+    };
+
+    class GLCommandBuffer : public CommandBuffer
+    {
+        public:
+            ~GLCommandBuffer() = default;
+
+            void set_constant_data_buffers(const GLuint *ubos, unsigned count)
+            {
+                this->ubos = ubos;
+                ubo_index = 0;
+                ubo_count = count;
+            }
+
+            void bind_program(Program *program) override;
+            void bind_storage_texture(unsigned binding, Texture *texture, Format format) override;
+            void bind_texture(unsigned binding, Texture *texture) override;
+            void bind_sampler(unsigned binding, Sampler *sampler) override;
+            void bind_storage_buffer(unsigned binding, Buffer *texture) override;
+            void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) override;
+            void dispatch(unsigned x, unsigned y, unsigned z) override;
+
+            void barrier(Buffer *buffer) override;
+            void barrier(Texture *buffer) override;
+            void barrier() override;
+
+            void push_constant_data(unsigned binding, const void *data, size_t size) override;
+
+        private:
+            const GLuint *ubos = nullptr;
+            unsigned ubo_count = 0;
+            unsigned ubo_index = 0;
+    };
+
+    class GLContext : public Context
+    {
+        public:
+            ~GLContext();
+
+            std::unique_ptr<Texture> create_texture(const void *initial_data,
+                    unsigned width, unsigned height,
+                    Format format) override;
+
+            std::unique_ptr<Buffer> create_buffer(const void *initial_data, size_t size, AccessMode access) override;
+            std::unique_ptr<Program> compile_compute_shader(const char *source) override;
+
+            CommandBuffer* request_command_buffer() override;
+            void submit_command_buffer(CommandBuffer *cmd) override;
+            void wait_idle() override;
+
+            const char* get_renderer_string() override;
+            void log(const char *fmt, ...) override;
+            double get_time() override;
+
+            unsigned get_max_work_group_threads() override;
+
+            const void* map(Buffer *buffer, size_t offset, size_t size) override;
+            void unmap(Buffer *buffer) override;
+
+            // Not supported in GLES, so override when creating platform-specific context.
+            bool supports_texture_readback() override { return false; }
+            void read_texture(void*, Texture*, Format) override {}
+
+        protected:
+            void teardown();
+
+        private:
+            static GLCommandBuffer static_command_buffer;
+
+            enum { MaxBuffersRing = 256 };
+            GLuint ubos[MaxBuffersRing];
+            bool initialized_ubos = false;
+    };
+
+    static inline GLenum convert(AccessMode mode)
+    {
+        switch (mode)
+        {
+            case AccessStreamCopy: return GL_STREAM_COPY;
+            case AccessStaticCopy: return GL_STATIC_COPY;
+            case AccessStreamRead: return GL_STREAM_READ;
+        }
+        return 0;
+    }
+
+    static inline GLenum convert(Format format)
+    {
+        switch (format)
+        {
+            case FormatR16G16B16A16Float: return GL_RGBA16F;
+            case FormatR32G32B32A32Float: return GL_RGBA32F;
+            case FormatR32Float: return GL_R32F;
+            case FormatR16G16Float: return GL_RG16F;
+            case FormatR32G32Float: return GL_RG32F;
+            case FormatR32Uint: return GL_R32UI;
+            case FormatUnknown: return 0;
+        }
+        return 0;
+    }
+
+    static inline GLenum convert_format(Format format)
+    {
+        switch (format)
+        {
+            case FormatR16G16Float: return GL_RG;
+            case FormatR32G32Float: return GL_RG;
+            case FormatR16G16B16A16Float: return GL_RGBA;
+            case FormatR32G32B32A32Float: return GL_RGBA;
+            case FormatR32Float: return GL_RED;
+            case FormatR32Uint: return GL_RED_INTEGER;
+            case FormatUnknown: return 0;
+        }
+        return 0;
+    }
+
+    static inline GLenum convert_type(Format format)
+    {
+        switch (format)
+        {
+            case FormatR16G16Float: return GL_HALF_FLOAT;
+            case FormatR16G16B16A16Float: return GL_HALF_FLOAT;
+            case FormatR32Float: return GL_FLOAT;
+            case FormatR32G32Float: return GL_FLOAT;
+            case FormatR32G32B32A32Float: return GL_FLOAT;
+            case FormatR32Uint: return GL_UNSIGNED_INT;
+            case FormatUnknown: return 0;
+        }
+        return 0;
+    }
+}
+
+#endif
--- a/glfft/glfft_interface.hpp
+++ b/glfft/glfft_interface.hpp
@@ -0,0 +1,131 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef GLFFT_INTERFACE_HPP__
+#define GLFFT_INTERFACE_HPP__
+
+#include <memory>
+
+namespace GLFFT
+{
+    class Context;
+
+    class Resource
+    {
+        public:
+            virtual ~Resource() = default;
+
+            // Non-movable, non-copyable to make things simpler.
+            Resource(Resource&&) = delete;
+            void operator=(const Resource&) = delete;
+
+        protected:
+            Resource() = default;
+    };
+
+    class Texture : public Resource {};
+    class Sampler : public Resource {};
+    class Buffer : public Resource {};
+
+    class Program
+    {
+        public:
+            virtual ~Program() = default;
+        protected:
+            friend class Context;
+            Program() = default;
+    };
+
+    enum AccessMode
+    {
+        AccessStreamCopy,
+        AccessStaticCopy,
+        AccessStreamRead
+    };
+
+    enum Format
+    {
+        FormatUnknown,
+        FormatR16G16B16A16Float,
+        FormatR32G32B32A32Float,
+        FormatR32G32Float,
+        FormatR32Float,
+        FormatR16G16Float,
+        FormatR32Uint
+    };
+
+    class CommandBuffer;
+
+    class Context
+    {
+        public:
+            virtual ~Context() = default;
+
+            virtual std::unique_ptr<Texture> create_texture(const void *initial_data,
+                    unsigned width, unsigned height,
+                    Format format) = 0;
+
+            virtual std::unique_ptr<Buffer> create_buffer(const void *initial_data, size_t size, AccessMode access) = 0;
+            virtual std::unique_ptr<Program> compile_compute_shader(const char *source) = 0;
+
+            virtual CommandBuffer* request_command_buffer() = 0;
+            virtual void submit_command_buffer(CommandBuffer *cmd) = 0;
+            virtual void wait_idle() = 0;
+
+            virtual const char* get_renderer_string() = 0;
+            virtual void log(const char *fmt, ...) = 0;
+            virtual double get_time() = 0;
+
+            virtual unsigned get_max_work_group_threads() = 0;
+
+            virtual const void* map(Buffer *buffer, size_t offset, size_t size) = 0;
+            virtual void unmap(Buffer *buffer) = 0;
+
+            virtual bool supports_texture_readback() = 0;
+            virtual void read_texture(void *buffer, Texture *texture, Format format) = 0;
+
+        protected:
+            Context() = default;
+    };
+
+    class CommandBuffer
+    {
+        public:
+            virtual ~CommandBuffer() = default;
+
+            virtual void bind_program(Program *program) = 0;
+            virtual void bind_storage_texture(unsigned binding, Texture *texture, Format format) = 0;
+            virtual void bind_texture(unsigned binding, Texture *texture) = 0;
+            virtual void bind_sampler(unsigned binding, Sampler *sampler) = 0;
+            virtual void bind_storage_buffer(unsigned binding, Buffer *texture) = 0;
+            virtual void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) = 0;
+            virtual void dispatch(unsigned x, unsigned y, unsigned z) = 0;
+
+            virtual void barrier(Buffer *buffer) = 0;
+            virtual void barrier(Texture *buffer) = 0;
+            virtual void barrier() = 0;
+
+            enum { MaxConstantDataSize = 64 };
+            virtual void push_constant_data(unsigned binding, const void *data, size_t size) = 0;
+
+        protected:
+            CommandBuffer() = default;
+    };
+}
+
+#endif
--- a/glfft/glfft_wisdom.cpp
+++ b/glfft/glfft_wisdom.cpp
@@ -0,0 +1,600 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "glfft_wisdom.hpp"
+#include "glfft_interface.hpp"
+#include "glfft.hpp"
+#include <utility>
+
+/* GLAVA NOTICE: automatic wisdom serialization support may be added at a late date */
+#ifdef GLFFT_SERIALIZATION
+#include "rapidjson/reader.h"
+#include "rapidjson/prettywriter.h"
+#include "rapidjson/stringbuffer.h"
+#include "rapidjson/document.h"
+using namespace rapidjson;
+#endif
+
+#ifdef GLFFT_CLI_ASYNC
+#include "glfft_cli.hpp"
+#endif
+
+using namespace std;
+using namespace GLFFT;
+
+FFTStaticWisdom FFTWisdom::get_static_wisdom_from_renderer(Context *context)
+{
+    FFTStaticWisdom res;
+
+    const char *renderer = context->get_renderer_string();
+    unsigned threads = context->get_max_work_group_threads();
+
+    if (strstr(renderer, "GeForce") || strstr(renderer, "Quadro"))
+    {
+        context->log("Detected GeForce/Quadro GPU.\n");
+        res.min_workgroup_size = 32; // Warp threads.
+        res.min_workgroup_size_shared = 32;
+        res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
+        res.min_vector_size = 2;
+        res.max_vector_size = 2;
+        res.shared_banked = FFTStaticWisdom::True;
+    }
+    else if (strstr(renderer, "Radeon"))
+    {
+        context->log("Detected Radeon GPU.\n");
+        res.min_workgroup_size = 64; // Wavefront threads (GCN).
+        res.min_workgroup_size_shared = 128;
+        res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
+        // TODO: Find if we can restrict this to 2 or 4 always.
+        res.min_vector_size = 2;
+        res.max_vector_size = 4;
+        res.shared_banked = FFTStaticWisdom::True;
+    }
+    else if (strstr(renderer, "Mali"))
+    {
+        context->log("Detected Mali GPU.\n");
+
+        res.min_workgroup_size = 4;
+        res.min_workgroup_size_shared = 4;
+        res.max_workgroup_size = 64; // Going beyond 64 threads per WG is not a good idea.
+        res.min_vector_size = 4;
+        res.max_vector_size = 4;
+        res.shared_banked = FFTStaticWisdom::False;
+    }
+    // TODO: Add more GPUs.
+
+    return res;
+}
+
+pair<double, FFTOptions::Performance> FFTWisdom::learn_optimal_options(
+        Context *context, unsigned Nx, unsigned Ny, unsigned radix,
+        Mode mode, Target input_target, Target output_target,
+        const FFTOptions::Type &type)
+{
+    WisdomPass pass = {
+        {
+            Nx, Ny, radix, mode, input_target, output_target,
+            type,
+        },
+        0.0,
+    };
+
+    auto itr = library.find(pass);
+    if (itr != end(library))
+    {
+        return make_pair(itr->first.cost, itr->second);
+    }
+    else
+    {
+        auto result = study(context, pass, type);
+        pass.cost = result.first;
+        library[pass] = result.second;
+
+        return result;
+    }
+}
+
+void FFTWisdom::learn_optimal_options_exhaustive(Context *context,
+        unsigned Nx, unsigned Ny,
+        Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type)
+{
+    bool learn_resolve = type == ComplexToReal || type == RealToComplex;
+    Mode vertical_mode = type == ComplexToComplexDual ? VerticalDual : Vertical;
+    Mode horizontal_mode = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
+
+    // Create wisdom for horizontal transforms and vertical transform.
+    static const unsigned radices[] = { 4, 8, 16, 64 };
+    for (auto radix : radices)
+    {
+        try
+        {
+            // If we're doing SSBO -> Image or Image -> SSBO. Create wisdom for the two variants.
+
+            // Learn plain transforms.
+            if (Ny > 1)
+            {
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, SSBO, fft_type);
+            }
+            learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, SSBO, fft_type);
+
+            // Learn the first/last pass transforms. Can be fairly significant since accessing textures makes more sense with
+            // block interleave and larger WG_Y sizes.
+            if (input_target != SSBO)
+            {
+                if (Ny > 1)
+                {
+                    learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, input_target, SSBO, fft_type);
+                }
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, input_target, SSBO, fft_type);
+            }
+
+            if (output_target != SSBO)
+            {
+                if (Ny > 1)
+                {
+                    learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, output_target, fft_type);
+                }
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, output_target, fft_type);
+            }
+        }
+#ifdef GLFFT_CLI_ASYNC
+        catch (const AsyncCancellation &)
+        {
+            throw;
+        }
+#endif
+        catch (...)
+        {
+            // If our default options cannot successfully create the radix pass (i.e. throws),
+            // just ignore it for purpose of creating wisdom.
+        }
+    }
+
+    auto resolve_type = fft_type;
+    resolve_type.input_fp16 = resolve_type.output_fp16;
+    Mode resolve_mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex;
+    Target resolve_input_target = SSBO;
+
+    // If we have C2R Nx1 transform, the first pass is resolve, so use those types.
+    if (type == ComplexToReal && Ny == 1)
+    {
+        resolve_type = fft_type;
+        resolve_input_target = input_target;
+    }
+
+    // If we need to do a resolve pass, train this case as well.
+    if (learn_resolve)
+    {
+        try
+        {
+            // If Ny == 1 and we're doing RealToComplex, this will be the last pass, so use output_target as target.
+            if (Ny == 1 && resolve_mode == ResolveRealToComplex)
+            {
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, output_target, resolve_type);
+            }
+            else
+            {
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, SSBO, resolve_type);
+            }
+        }
+#ifdef GLFFT_CLI_ASYNC
+        catch (const AsyncCancellation &)
+        {
+            throw;
+        }
+#endif
+        catch (...)
+        {
+            // If our default options cannot successfully create the radix pass (i.e. throws),
+            // just ignore it for purpose of creating wisdom.
+        }
+    }
+}
+
+double FFTWisdom::bench(Context *context, Resource *output, Resource *input,
+        const WisdomPass &pass, const FFTOptions &options, const shared_ptr<ProgramCache> &cache) const
+{
+    FFT fft(context, pass.pass.Nx, pass.pass.Ny, pass.pass.radix, pass.pass.input_target != SSBO ? 1 : pass.pass.radix,
+            pass.pass.mode, pass.pass.input_target, pass.pass.output_target,
+            cache, options);
+
+    return fft.bench(context,
+            output, input, params.warmup, params.iterations, params.dispatches, params.timeout);
+}
+
+static inline unsigned mode_to_size(Mode mode)
+{
+    switch (mode)
+    {
+        case VerticalDual:
+        case HorizontalDual:
+        case ResolveRealToComplex:
+        case ResolveComplexToReal:
+            return 4;
+
+        default:
+            return 2;
+    }
+}
+
+std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, const WisdomPass &pass, FFTOptions::Type type) const
+{
+    auto cache = make_shared<ProgramCache>();
+
+    unique_ptr<Resource> output;
+    unique_ptr<Resource> input;
+
+    unsigned mode_size = mode_to_size(pass.pass.mode);
+    vector<float> tmp(mode_size * pass.pass.Nx * pass.pass.Ny);
+
+    if (pass.pass.input_target == SSBO)
+    {
+        input = context->create_buffer(tmp.data(), tmp.size() * sizeof(float) >> type.input_fp16, AccessStaticCopy);
+    }
+    else
+    {
+        Format format = FormatUnknown;
+        unsigned Nx = pass.pass.Nx;
+        unsigned Ny = pass.pass.Ny;
+
+        switch (pass.pass.mode)
+        {
+            case VerticalDual:
+            case HorizontalDual:
+                format = FormatR32G32B32A32Float;
+                break;
+
+            case Vertical:
+            case Horizontal:
+                format = FormatR32G32Float;
+                break;
+
+            case ResolveComplexToReal:
+                format = FormatR32G32Float;
+                Nx *= 2;
+                break;
+
+            default:
+                throw logic_error("Invalid input mode.\n");
+        }
+
+        input = context->create_texture(tmp.data(), Nx, Ny, format);
+    }
+
+    if (pass.pass.output_target == SSBO)
+    {
+        output = context->create_buffer(nullptr, tmp.size() * sizeof(float) >> type.output_fp16, AccessStreamCopy);
+    }
+    else
+    {
+        Format format = FormatUnknown;
+        unsigned Nx = pass.pass.Nx;
+        unsigned Ny = pass.pass.Ny;
+
+        switch (pass.pass.mode)
+        {
+            case VerticalDual:
+            case HorizontalDual:
+                format = FormatR32G32B32A32Float;
+                break;
+
+            case Vertical:
+            case Horizontal:
+                format = FormatR32G32Float;
+                break;
+
+            case ResolveRealToComplex:
+                format = FormatR32G32Float;
+                Nx *= 2;
+                break;
+
+            default:
+                throw logic_error("Invalid output mode.\n");
+        }
+
+        output = context->create_texture(nullptr, Nx, Ny, format);
+    }
+
+    // Exhaustive search, look for every sensible combination, and find fastest parameters.
+    // Get initial best cost with defaults.
+    FFTOptions::Performance best_perf;
+    double minimum_cost = bench(context, output.get(), input.get(), pass, { best_perf, type }, cache);
+
+    static const FFTStaticWisdom::Tristate shared_banked_values[] = { FFTStaticWisdom::False, FFTStaticWisdom::True };
+    static const unsigned vector_size_values[] = { 2, 4, 8 };
+    static const unsigned workgroup_size_x_values[] = { 4, 8, 16, 32, 64, 128, 256 };
+    static const unsigned workgroup_size_y_values[] = { 1, 2, 4, 8, };
+
+    bool test_resolve = pass.pass.mode == ResolveComplexToReal || pass.pass.mode == ResolveRealToComplex;
+    bool test_dual = pass.pass.mode == VerticalDual || pass.pass.mode == HorizontalDual;
+    unsigned bench_count = 0;
+
+    for (auto shared_banked : shared_banked_values)
+    {
+        // Useless test, since shared banked is only relevant for radix 16/64.
+        if (pass.pass.radix < 16 && shared_banked)
+        {
+            continue;
+        }
+
+        bool fair_shared_banked = (pass.pass.radix < 16) ||
+                                  (static_wisdom.shared_banked == FFTStaticWisdom::DontCare) ||
+                                  (shared_banked == static_wisdom.shared_banked);
+
+        if (!fair_shared_banked)
+        {
+            continue;
+        }
+
+        for (auto vector_size : vector_size_values)
+        {
+            // Resolve passes currently only support vector size 2. Shared banked makes no sense either.
+            if (test_resolve && (vector_size != 2 || shared_banked))
+            {
+                continue;
+            }
+
+            // We can only use vector_size 8 with FP16.
+            if (vector_size == 8 && (!type.fp16 || !type.input_fp16 || !type.output_fp16))
+            {
+                continue;
+            }
+
+            // Makes little sense to test since since vector_size will be bumped to 4 anyways.
+            if (test_dual && vector_size < 4)
+            {
+                continue;
+            }
+
+            for (auto workgroup_size_x : workgroup_size_x_values)
+            {
+                for (auto workgroup_size_y : workgroup_size_y_values)
+                {
+                    unsigned workgroup_size  = workgroup_size_x * workgroup_size_y;
+
+                    unsigned min_workgroup_size = pass.pass.radix >= 16 ? static_wisdom.min_workgroup_size_shared :
+                                                                          static_wisdom.min_workgroup_size;
+
+                    unsigned min_vector_size = test_dual ? max(4u, static_wisdom.min_vector_size) : static_wisdom.min_vector_size;
+                    unsigned max_vector_size = test_dual ? max(4u, static_wisdom.max_vector_size) : static_wisdom.max_vector_size;
+
+                    bool fair_workgroup_size = workgroup_size <= static_wisdom.max_workgroup_size &&
+                                               workgroup_size >= min_workgroup_size;
+                    if (pass.pass.Ny == 1 && workgroup_size_y > 1)
+                    {
+                        fair_workgroup_size = false;
+                    }
+
+                    if (!fair_workgroup_size)
+                    {
+                        continue;
+                    }
+
+                    // If we have dual mode, accept vector sizes larger than max.
+                    bool fair_vector_size = test_resolve || (vector_size <= max_vector_size &&
+                                                             vector_size >= min_vector_size);
+
+                    if (!fair_vector_size)
+                    {
+                        continue;
+                    }
+
+                    FFTOptions::Performance perf;
+                    perf.shared_banked = shared_banked;
+                    perf.vector_size = vector_size;
+                    perf.workgroup_size_x = workgroup_size_x;
+                    perf.workgroup_size_y = workgroup_size_y;
+
+                    try
+                    {
+                        // If workgroup sizes are too big for our test, this will throw.
+                        double cost = bench(context, output.get(), input.get(), pass, { perf, type }, cache);
+                        bench_count++;
+
+#if 1
+                        context->log("\nWisdom run (mode = %u, radix = %u):\n", pass.pass.mode, pass.pass.radix);
+                        context->log("  Width:            %4u\n", pass.pass.Nx);
+                        context->log("  Height:           %4u\n", pass.pass.Ny);
+                        context->log("  Shared banked:     %3s\n", shared_banked ? "yes" : "no");
+                        context->log("  Vector size:         %u\n", vector_size);
+                        context->log("  Workgroup size: (%u, %u)\n", workgroup_size_x, workgroup_size_y);
+                        context->log("  Cost:         %8.3g\n", cost);
+#endif
+
+                        if (cost < minimum_cost)
+                        {
+#if 1
+                            context->log("  New optimal solution! (%g -> %g)\n", minimum_cost, cost);
+#endif
+                            best_perf = perf;
+                            minimum_cost = cost;
+                        }
+                    }
+#ifdef GLFFT_CLI_ASYNC
+                    catch (const AsyncCancellation &)
+                    {
+                        throw;
+                    }
+#endif
+                    catch (...)
+                    {
+                        // If we pass in bogus parameters,
+                        // FFT will throw and we just ignore this.
+                    }
+                }
+            }
+        }
+    }
+
+    context->log("Tested %u variants!\n", bench_count);
+    return make_pair(minimum_cost, best_perf);
+}
+
+const pair<const WisdomPass, FFTOptions::Performance>* FFTWisdom::find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
+        Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type) const
+{
+    WisdomPass pass = {
+        {
+            Nx, Ny, radix, mode, input_target, output_target,
+            type,
+        },
+        0.0,
+    };
+
+    auto itr = library.find(pass);
+    return itr != end(library) ? (&(*itr)) : nullptr;
+}
+
+const FFTOptions::Performance& FFTWisdom::find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
+        Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const
+{
+    WisdomPass pass = {
+        {
+            Nx, Ny, radix, mode, input_target, output_target,
+            base_options.type,
+        },
+        0.0,
+    };
+
+    auto itr = library.find(pass);
+
+#if 0
+    if (itr == end(library))
+    {
+        context->log("Didn't find options for (%u x %u, radix %u, mode %u, input_target %u, output_target %u)\n",
+                Nx, Ny, radix, unsigned(mode), unsigned(input_target), unsigned(output_target));
+    }
+#endif
+
+    return itr != end(library) ? itr->second : base_options.performance;
+}
+
+#ifdef GLFFT_SERIALIZATION
+std::string FFTWisdom::archive() const
+{
+    StringBuffer s;
+    PrettyWriter<StringBuffer> writer{s};
+
+    writer.StartObject();
+    writer.String("library");
+
+    // Serialize all wisdom accumulated to a string.
+    writer.StartArray();
+    for (auto &entry : library)
+    {
+        writer.StartObject();
+
+        writer.String("scenario");
+        writer.StartObject();
+        writer.String("nx");
+        writer.Uint(entry.first.pass.Nx);
+        writer.String("ny");
+        writer.Uint(entry.first.pass.Ny);
+        writer.String("radix");
+        writer.Uint(entry.first.pass.radix);
+        writer.String("mode");
+        writer.Uint(entry.first.pass.mode);
+        writer.String("input_target");
+        writer.Uint(entry.first.pass.input_target);
+        writer.String("output_target");
+        writer.Uint(entry.first.pass.output_target);
+        writer.EndObject();
+
+        writer.String("type");
+        writer.StartObject();
+        writer.String("fp16");
+        writer.Bool(entry.first.pass.type.fp16);
+        writer.String("input_fp16");
+        writer.Bool(entry.first.pass.type.input_fp16);
+        writer.String("output_fp16");
+        writer.Bool(entry.first.pass.type.output_fp16);
+        writer.String("normalize");
+        writer.Bool(entry.first.pass.type.normalize);
+        writer.EndObject();
+
+        writer.String("performance");
+        writer.StartObject();
+        writer.String("shared_banked");
+        writer.Bool(entry.second.shared_banked);
+        writer.String("vector_size");
+        writer.Uint(entry.second.vector_size);
+        writer.String("workgroup_size_x");
+        writer.Uint(entry.second.workgroup_size_x);
+        writer.String("workgroup_size_y");
+        writer.Uint(entry.second.workgroup_size_y);
+        writer.EndObject();
+
+        writer.String("cost");
+        writer.Double(entry.first.cost);
+
+        writer.EndObject();
+    }
+    writer.EndArray();
+    writer.EndObject();
+    return s.GetString();
+}
+
+void FFTWisdom::extract(const char *json)
+{
+    Document document;
+    document.Parse(json);
+
+    // Exception safe, we don't want to risk throwing in the middle of the
+    // loop, leaving the library is broken state.
+    unordered_map<WisdomPass, FFTOptions::Performance> new_library;
+
+    auto &lib = document["library"];
+
+    // y u no begin(), end() :(
+    for (Value::ConstValueIterator itr = lib.Begin(); itr != lib.End(); ++itr)
+    {
+        auto &v = *itr;
+
+        WisdomPass pass;
+        FFTOptions::Performance perf;
+
+        pass.cost = v["cost"].GetDouble();
+
+        auto &scenario = v["scenario"];
+        pass.pass.Nx = scenario["nx"].GetUint();
+        pass.pass.Ny = scenario["ny"].GetUint();
+        pass.pass.radix = scenario["radix"].GetUint();
+        pass.pass.mode = static_cast<Mode>(scenario["mode"].GetUint());
+        pass.pass.input_target = static_cast<Target>(scenario["input_target"].GetUint());
+        pass.pass.output_target = static_cast<Target>(scenario["output_target"].GetUint());
+
+        auto &type = v["type"];
+        pass.pass.type.fp16 = type["fp16"].GetBool();
+        pass.pass.type.input_fp16 = type["input_fp16"].GetBool();
+        pass.pass.type.output_fp16 = type["output_fp16"].GetBool();
+        pass.pass.type.normalize = type["normalize"].GetBool();
+
+        auto &performance = v["performance"];
+        perf.shared_banked = performance["shared_banked"].GetBool();
+        perf.vector_size = performance["vector_size"].GetUint();
+        perf.workgroup_size_x = performance["workgroup_size_x"].GetUint();
+        perf.workgroup_size_y = performance["workgroup_size_y"].GetUint();
+
+        new_library[pass] = perf;
+    }
+
+    // Exception safe.
+    swap(library, new_library);
+}
+#endif
+
--- a/glfft/glfft_wisdom.hpp
+++ b/glfft/glfft_wisdom.hpp
@@ -0,0 +1,149 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef GLFFT_WISDOM_HPP__
+#define GLFFT_WISDOM_HPP__
+
+#include <unordered_map>
+#include <utility>
+#include <string>
+#include "glfft_common.hpp"
+#include "glfft_interface.hpp"
+
+namespace GLFFT
+{
+
+struct WisdomPass
+{
+    struct
+    {
+        unsigned Nx;
+        unsigned Ny;
+        unsigned radix;
+        Mode mode;
+        Target input_target;
+        Target output_target;
+        FFTOptions::Type type;
+    } pass;
+
+    double cost;
+
+    bool operator==(const WisdomPass &other) const
+    {
+        return std::memcmp(&pass, &other.pass, sizeof(pass)) == 0;
+    }
+};
+
+}
+
+namespace std
+{
+    template<>
+    struct hash<GLFFT::WisdomPass>
+    {
+        std::size_t operator()(const GLFFT::WisdomPass &params) const
+        {
+            std::size_t h = 0;
+            hash<uint8_t> hasher;
+            for (std::size_t i = 0; i < sizeof(params.pass); i++)
+            {
+                h ^= hasher(reinterpret_cast<const uint8_t*>(&params.pass)[i]);
+            }
+
+            return h;
+        }
+    };
+}
+
+namespace GLFFT
+{
+
+// Adds information which depends on the GPU vendor.
+// This can speed up learning process, since there will be fewer "obviously wrong" settings to test.
+struct FFTStaticWisdom
+{
+    enum Tristate { True = 1, False = 0, DontCare = -1 };
+
+    unsigned min_workgroup_size = 1;
+    unsigned min_workgroup_size_shared = 1;
+    unsigned max_workgroup_size = 128; // GLES 3.1 mandates support for this.
+    unsigned min_vector_size = 2;
+    unsigned max_vector_size = 4;
+    Tristate shared_banked = DontCare;
+};
+
+class FFTWisdom
+{
+    public:
+        std::pair<double, FFTOptions::Performance> learn_optimal_options(Context *ctx,
+                unsigned Nx, unsigned Ny, unsigned radix,
+                Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type);
+
+        void learn_optimal_options_exhaustive(Context *ctx,
+                unsigned Nx, unsigned Ny,
+                Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type);
+
+        const std::pair<const WisdomPass, FFTOptions::Performance>* find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
+                Mode mode, Target input_target, Target output_target, const FFTOptions::Type &base_options) const;
+
+        const FFTOptions::Performance& find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
+                Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const;
+
+        void set_static_wisdom(FFTStaticWisdom static_wisdom) { this->static_wisdom = static_wisdom; }
+        static FFTStaticWisdom get_static_wisdom_from_renderer(Context *context);
+
+        void set_bench_params(unsigned warmup,
+                unsigned iterations, unsigned dispatches, double timeout)
+        {
+            params.warmup = warmup;
+            params.iterations = iterations;
+            params.dispatches = dispatches;
+            params.timeout = timeout;
+        }
+
+#ifdef GLFFT_SERIALIZATION
+        // Serialization interface.
+        std::string archive() const;
+        void extract(const char *json);
+#endif
+
+    private:
+        std::unordered_map<WisdomPass, FFTOptions::Performance> library;
+
+        std::pair<double, FFTOptions::Performance> study(Context *context,
+                const WisdomPass &pass, FFTOptions::Type options) const;
+
+        double bench(Context *cmd, Resource *output, Resource *input,
+                const WisdomPass &pass, const FFTOptions &options,
+                const std::shared_ptr<ProgramCache> &cache) const;
+
+        FFTStaticWisdom static_wisdom;
+
+        struct
+        {
+            unsigned warmup = 2;
+            unsigned iterations = 20;
+            unsigned dispatches = 50;
+            double timeout = 1.0;
+        } params;
+};
+
+}
+
+#endif
+