From b26f11ae20bbdd6be7e6509f38a82cbdcf4a392b Mon Sep 17 00:00:00 2001
From: Jarcode <wacossusca34@gmail.com>
Date: Wed, 11 Sep 2019 23:06:03 -0700
Subject: [PATCH] Add GLFFT

---
 glfft/LICENSE_ORIGINAL         |   19 +
 glfft/glfft.cpp                | 1125 ++++++++++++++++++++++++++++++++
 glfft/glfft.hpp                |  225 +++++++
 glfft/glfft_common.hpp         |  178 +++++
 glfft/glfft_gl_api_headers.hpp |    6 +
 glfft/glfft_gl_interface.cpp   |  310 +++++++++
 glfft/glfft_gl_interface.hpp   |  258 ++++++++
 glfft/glfft_interface.hpp      |  131 ++++
 glfft/glfft_wisdom.cpp         |  600 +++++++++++++++++
 glfft/glfft_wisdom.hpp         |  149 +++++
 10 files changed, 3001 insertions(+)
 create mode 100644 glfft/LICENSE_ORIGINAL
 create mode 100644 glfft/glfft.cpp
 create mode 100644 glfft/glfft.hpp
 create mode 100644 glfft/glfft_common.hpp
 create mode 100644 glfft/glfft_gl_api_headers.hpp
 create mode 100644 glfft/glfft_gl_interface.cpp
 create mode 100644 glfft/glfft_gl_interface.hpp
 create mode 100644 glfft/glfft_interface.hpp
 create mode 100644 glfft/glfft_wisdom.cpp
 create mode 100644 glfft/glfft_wisdom.hpp
diff --git a/glfft/LICENSE_ORIGINAL b/glfft/LICENSE_ORIGINAL
new file mode 100644
index 0000000..1d7f765
--- /dev/null
+++ b/glfft/LICENSE_ORIGINAL
@@ -0,0 +1,19 @@
+Copyright (c) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/glfft/glfft.cpp b/glfft/glfft.cpp
new file mode 100644
index 0000000..a4b53b5
--- /dev/null
+++ b/glfft/glfft.cpp
@@ -0,0 +1,1125 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "glfft.hpp"
+#include <algorithm>
+#include <stdexcept>
+#include <fstream>
+#include <sstream>
+#include <numeric>
+#include <assert.h>
+#include <cmath>
+
+#ifdef GLFFT_CLI_ASYNC
+#include "glfft_cli.hpp"
+#endif
+
+/* GLava addition: should be defined by meson */
+#if defined(GLAVA_STANDALONE) || !defined(SHADER_INSTALL_PATH)
+#undef SHADER_INSTALL_PATH
+#define SHADER_INSTALL_PATH "../shaders/glava"
+#endif
+
+using namespace std;
+using namespace GLFFT;
+
+enum Bindings
+{
+    BindingSSBOIn = 0,
+    BindingSSBOOut = 1,
+    BindingSSBOAux = 2,
+    BindingUBO = 3,
+    BindingTexture0 = 4,
+    BindingTexture1 = 5,
+    BindingImage = 6
+};
+
+struct WorkGroupSize
+{
+    unsigned x, y, z;
+};
+
+struct Radix
+{
+    WorkGroupSize size;
+    unsigned num_workgroups_x;
+    unsigned num_workgroups_y;
+    unsigned radix;
+    unsigned vector_size;
+    bool shared_banked;
+};
+
+static unsigned next_pow2(unsigned v)
+{
+    v--;
+    v |= v >> 16;
+    v |= v >> 8;
+    v |= v >> 4;
+    v |= v >> 2;
+    v |= v >> 1;
+    return v + 1;
+}
+
+static void reduce(unsigned &wg_size, unsigned &divisor)
+{
+    if (divisor > 1 && wg_size >= divisor)
+    {
+        wg_size /= divisor;
+        divisor = 1;
+    }
+    else if (divisor > 1 && wg_size < divisor)
+    {
+        divisor /= wg_size;
+        wg_size = 1;
+    }
+}
+
+static unsigned radix_to_wg_z(unsigned radix)
+{
+    switch (radix)
+    {
+        case 16:
+            return 4;
+
+        case 64:
+            return 8;
+
+        default:
+            return 1;
+    }
+}
+
+static Radix build_radix(unsigned Nx, unsigned Ny,
+        Mode mode, unsigned vector_size, bool shared_banked, unsigned radix,
+        WorkGroupSize size,
+        bool pow2_stride)
+{
+    unsigned wg_x = 0, wg_y = 0;
+
+    if (Ny == 1 && size.y > 1)
+    {
+        throw logic_error("WorkGroupSize.y must be 1, when Ny == 1.\n");
+    }
+
+    // To avoid too many threads per workgroup due to workgroup_size_z,
+    // try to divide workgroup_size_y, then workgroup_size_x.
+    // TODO: Make a better constraint solver which takes into account cache line sizes,
+    // and image swizzling patterns, etc ... Not that critical though, since wisdom interface
+    // will find the optimal options despite this.
+    unsigned divisor = size.z;
+    reduce(size.y, divisor);
+    reduce(size.x, divisor);
+
+    switch (mode)
+    {
+        case Vertical:
+            // If we have pow2_stride, we need to transform 2^n + 1 elements horizontally,
+            // so just add a single workgroup in X.
+            // We pad by going up to pow2 stride anyways.
+            // We will transform some garbage,
+            // but it's better than transforming close to double the amount.
+            wg_x = (2 * Nx) / (vector_size * size.x) + pow2_stride;
+            wg_y = Ny / (size.y * radix);
+            break;
+
+        case VerticalDual:
+            vector_size = max(vector_size, 4u);
+            wg_x = (4 * Nx) / (vector_size * size.x);
+            wg_y = Ny / (size.y * radix);
+            break;
+
+        case Horizontal:
+            wg_x = (2 * Nx) / (vector_size * radix * size.x);
+            wg_y = Ny / size.y;
+            break;
+
+        case HorizontalDual:
+            vector_size = max(vector_size, 4u);
+            wg_x = (4 * Nx) / (vector_size * radix * size.x);
+            wg_y = Ny / size.y;
+            break;
+
+        default:
+            assert(0);
+    }
+
+    return { size, wg_x, wg_y, radix, vector_size, shared_banked };
+}
+
+// Resolve radices are simpler, and don't yet support different vector sizes, etc.
+static Radix build_resolve_radix(unsigned Nx, unsigned Ny, WorkGroupSize size)
+{
+    return { size, Nx / size.x, Ny / size.y, 2, 2, false };
+}
+
+// Smaller FFT with larger workgroups are not always possible to create.
+static bool is_radix_valid(unsigned Nx, unsigned Ny,
+        Mode mode, unsigned vector_size, unsigned radix,
+        WorkGroupSize size,
+        bool pow2_stride)
+{
+    auto res = build_radix(Nx, Ny,
+            mode, vector_size, false, radix,
+            size,
+            pow2_stride);
+
+    return res.num_workgroups_x > 0 && res.num_workgroups_y > 0;
+}
+
+static double find_cost(unsigned Nx, unsigned Ny, Mode mode, unsigned radix,
+        const FFTOptions &options, const FFTWisdom &wisdom)
+{
+    auto opt = wisdom.find_optimal_options(Nx, Ny, radix, mode, SSBO, SSBO, options.type);
+
+    // Return a very rough estimate if we cannot find cost.
+    // The cost functions generated here are expected to be huge,
+    // always much larger than true cost functions.
+    // The purpose of this is to give a strong bias towards radices we have wisdom for.
+    // We also give a bias towards larger radices, since they are generally more BW efficient.
+    return opt ? opt->first.cost : Nx * Ny * (log2(float(radix)) + 2.0f);
+}
+
+struct CostPropagate
+{
+    CostPropagate() = default;
+    CostPropagate(double cost, vector<unsigned> radices)
+        : cost(cost), radices(move(radices)) {}
+
+    void merge_if_better(const CostPropagate &a, const CostPropagate &b)
+    {
+        double new_cost = a.cost + b.cost;
+
+        if ((cost == 0.0 || new_cost < cost) && a.cost != 0.0 && b.cost != 0.0)
+        {
+            cost = new_cost;
+            radices = a.radices;
+            radices.insert(end(radices), begin(b.radices), end(b.radices));
+        }
+    }
+
+    double cost = 0.0;
+    vector<unsigned> radices;
+};
+
+static vector<Radix> split_radices(unsigned Nx, unsigned Ny, Mode mode, Target input_target, Target output_target,
+        const FFTOptions &options,
+        bool pow2_stride, const FFTWisdom &wisdom, double &accumulate_cost)
+{
+    unsigned N;
+    switch (mode)
+    {
+        case Vertical:
+        case VerticalDual:
+            N = Ny;
+            break;
+
+        case Horizontal:
+        case HorizontalDual:
+            N = Nx;
+            break;
+
+        default:
+            return {};
+    }
+
+    // N == 1 is for things like Nx1 transforms where we don't do any vertical transforms.
+    if (N == 1)
+    {
+        return {};
+    }
+
+    // Treat cost 0.0 as invalid.
+    double cost_table[8] = {0.0};
+    CostPropagate cost_propagate[32];
+
+    // Fill table with fastest known ways to do radix 4, radix 8, radix 16, and 64.
+    // We'll then find the optimal subdivision which has the lowest additive cost.
+    cost_table[2] = find_cost(Nx, Ny, mode,  4, options, wisdom);
+    cost_table[3] = find_cost(Nx, Ny, mode,  8, options, wisdom);
+    cost_table[4] = find_cost(Nx, Ny, mode, 16, options, wisdom);
+    cost_table[6] = find_cost(Nx, Ny, mode, 64, options, wisdom);
+
+    auto is_valid = [&](unsigned radix) -> bool {
+        unsigned workgroup_size_z = radix_to_wg_z(radix);
+        auto &opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options);
+
+        // We don't want pow2_stride to round up a very inefficient work group and make the is_valid test pass.
+        return is_radix_valid(Nx, Ny,
+                mode, opt.vector_size, radix,
+                { opt.workgroup_size_x, opt.workgroup_size_y, workgroup_size_z },
+                false);
+    };
+
+    // If our work-space is too small to allow certain radices, we disable them from consideration here.
+    for (unsigned i = 2; i <= 6; i++)
+    {
+        // Don't check the composite radix.
+        if (i == 5)
+        {
+            continue;
+        }
+
+        if (is_valid(1 << i))
+        {
+            cost_propagate[i] = CostPropagate(cost_table[i], { 1u << i });
+        }
+    }
+
+    // Now start bubble this up all the way to N, starting from radix 16.
+    for (unsigned i = 4; (1u << i) <= N; i++)
+    {
+        auto &target = cost_propagate[i];
+
+        for (unsigned r = 2; i - r >= r; r++)
+        {
+            target.merge_if_better(cost_propagate[r], cost_propagate[i - r]);
+        }
+
+        if ((1u << i) == N && target.cost == 0.0)
+        {
+            throw logic_error("There is no possible subdivision ...\n");
+        }
+    }
+
+    // Ensure that the radix splits are sensible.
+    // A radix-N non p-1 transform mandates that p factor is at least N.
+    // Sort the splits so that larger radices come first.
+    // For composite radices like 16 and 64, they are built with 4x4 and 8x8, so we only
+    // need p factors for 4 and 8 for those cases.
+    // The cost function doesn't depend in which order we split the radices.
+    auto &cost = cost_propagate[unsigned(log2(float(N)))];
+    auto radices = move(cost.radices);
+
+    sort(begin(radices), end(radices), greater<unsigned>());
+
+    if (accumulate(begin(radices), end(radices), 1u, multiplies<unsigned>()) != N)
+    {
+        throw logic_error("Radix splits are invalid.");
+    }
+
+    vector<Radix> radices_out;
+    radices_out.reserve(radices.size());
+
+    // Fill in the structs with all information.
+    for (auto radix : radices)
+    {
+        bool first = radices_out.empty();
+        bool last = radices_out.size() + 1 == radices.size();
+
+        // Use known performance options as a fallback.
+        // We used SSBO -> SSBO cost functions to find the optimal radix splits,
+        // but replace first and last options with Image -> SSBO / SSBO -> Image cost functions if appropriate.
+        auto &orig_opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options);
+        auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode,
+                first ? input_target : SSBO,
+                last ? output_target : SSBO,
+                { orig_opt, options.type });
+
+        radices_out.push_back(build_radix(Nx, Ny,
+                    mode, opts.vector_size, opts.shared_banked, radix,
+                    { opts.workgroup_size_x, opts.workgroup_size_y, radix_to_wg_z(radix) },
+                    pow2_stride));
+    }
+
+    accumulate_cost += cost.cost;
+    return radices_out;
+}
+
+Program* ProgramCache::find_program(const Parameters &parameters) const
+{
+    auto itr = programs.find(parameters);
+    if (itr != end(programs))
+    {
+        return itr->second.get();
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+void ProgramCache::insert_program(const Parameters &parameters, std::unique_ptr<Program> program)
+{
+    programs[parameters] = move(program);
+}
+
+Program* FFT::get_program(const Parameters &params)
+{
+    Program *prog = cache->find_program(params);
+    if (!prog)
+    {
+        auto newprog = build_program(params);
+        if (!newprog)
+        {
+            throw runtime_error("Failed to compile shader.\n");
+        }
+        prog = newprog.get();
+        cache->insert_program(params, move(newprog));
+    }
+    return prog;
+}
+
+static inline unsigned mode_to_input_components(Mode mode)
+{
+    switch (mode)
+    {
+        case HorizontalDual:
+        case VerticalDual:
+            return 4;
+
+        case Horizontal:
+        case Vertical:
+        case ResolveComplexToReal:
+            return 2;
+
+        case ResolveRealToComplex:
+            return 1;
+
+        default:
+            return 0;
+    }
+}
+
+FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
+        unsigned radix, unsigned p,
+        Mode mode, Target input_target, Target output_target,
+        std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options)
+    : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
+{
+    set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
+
+    if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
+    {
+        throw logic_error("FFT size is not POT.");
+    }
+
+    if (p != 1 && input_target != SSBO)
+    {
+        throw logic_error("P != 1 only supported with SSBO as input.");
+    }
+
+    if (p < radix && output_target != SSBO)
+    {
+        throw logic_error("P < radix only supported with SSBO as output.");
+    }
+
+    // We don't really care about transform direction since it's just a matter of sign-flipping twiddles,
+    // but we have to obey some fundamental assumptions of resolve passes.
+    Direction direction = mode == ResolveComplexToReal ? Inverse : Forward;
+
+    Radix res;
+    if (mode == ResolveRealToComplex || mode == ResolveComplexToReal)
+    {
+        res = build_resolve_radix(Nx, Ny, { options.performance.workgroup_size_x, options.performance.workgroup_size_y, 1 });
+    }
+    else
+    {
+        res = build_radix(Nx, Ny,
+                mode, options.performance.vector_size, options.performance.shared_banked, radix,
+                { options.performance.workgroup_size_x, options.performance.workgroup_size_y, radix_to_wg_z(radix) },
+                false);
+    }
+
+    const Parameters params = {
+        res.size.x,
+        res.size.y,
+        res.size.z,
+        res.radix,
+        res.vector_size,
+        direction,
+        mode,
+        input_target,
+        output_target,
+        p == 1,
+        res.shared_banked,
+        options.type.fp16, options.type.input_fp16, options.type.output_fp16,
+        options.type.normalize,
+    };
+
+    if (res.num_workgroups_x == 0 || res.num_workgroups_y == 0)
+    {
+        throw logic_error("Invalid workgroup sizes for this radix.");
+    }
+
+    unsigned uv_scale_x = res.vector_size / mode_to_input_components(mode);
+    const Pass pass = {
+        params,
+        res.num_workgroups_x, res.num_workgroups_y,
+        uv_scale_x,
+        next_pow2(res.num_workgroups_x * params.workgroup_size_x),
+        get_program(params),
+    };
+
+    passes.push_back(pass);
+}
+
+static inline void print_radix_splits(Context *context, const vector<Radix> radices[2])
+{
+    context->log("Transform #1\n");
+    for (auto &radix : radices[0])
+    {
+        context->log("  Size: (%u, %u, %u)\n",
+                radix.size.x, radix.size.y, radix.size.z);
+        context->log("  Dispatch: (%u, %u)\n",
+                radix.num_workgroups_x, radix.num_workgroups_y);
+        context->log("  Radix: %u\n",
+                radix.radix);
+        context->log("  VectorSize: %u\n\n",
+                radix.vector_size);
+    }
+
+    context->log("Transform #2\n");
+    for (auto &radix : radices[1])
+    {
+        context->log("  Size: (%u, %u, %u)\n",
+                radix.size.x, radix.size.y, radix.size.z);
+        context->log("  Dispatch: (%u, %u)\n",
+                radix.num_workgroups_x, radix.num_workgroups_y);
+        context->log("  Radix: %u\n",
+                radix.radix);
+        context->log("  VectorSize: %u\n\n",
+                radix.vector_size);
+    }
+}
+
+static inline unsigned type_to_input_components(Type type)
+{
+    switch (type)
+    {
+        case ComplexToComplex:
+        case ComplexToReal:
+            return 2;
+
+        case RealToComplex:
+            return 1;
+
+        case ComplexToComplexDual:
+            return 4;
+
+        default:
+            return 0;
+    }
+}
+
+FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
+        Type type, Direction direction, Target input_target, Target output_target,
+        std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom)
+    : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
+{
+    set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
+
+    size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2);
+    temp_buffer_size >>= options.type.output_fp16;
+
+    temp_buffer = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
+    if (output_target != SSBO)
+    {
+        temp_buffer_image = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
+    }
+
+    bool expand = false;
+    if (type == ComplexToReal || type == RealToComplex)
+    {
+        // If we're doing C2R or R2C, we'll need double the scratch memory,
+        // so make sure we're dividing Nx *after* allocating.
+        Nx /= 2;
+        expand = true;
+    }
+
+    // Sanity checks.
+    if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
+    {
+        throw logic_error("FFT size is not POT.");
+    }
+
+    if (type == ComplexToReal && direction == Forward)
+    {
+        throw logic_error("ComplexToReal transforms requires inverse transform.");
+    }
+
+    if (type == RealToComplex && direction != Forward)
+    {
+        throw logic_error("RealToComplex transforms requires forward transform.");
+    }
+
+    if (type == RealToComplex && input_target == Image)
+    {
+        throw logic_error("Input real-to-complex must use ImageReal target.");
+    }
+
+    if (type == ComplexToReal && output_target == Image)
+    {
+        throw logic_error("Output complex-to-real must use ImageReal target.");
+    }
+
+    vector<Radix> radices[2];
+    Mode modes[2];
+    Target targets[4];
+
+    switch (direction)
+    {
+        case Forward:
+            modes[0] = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
+            modes[1] = type == ComplexToComplexDual ? VerticalDual : Vertical;
+
+            targets[0] = input_target;
+            targets[1] = Ny > 1 ? SSBO : output_target;
+            targets[2] = targets[1];
+            targets[3] = output_target;
+
+            radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, false, wisdom, cost);
+            radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, expand, wisdom, cost);
+            break;
+
+        case Inverse:
+        case InverseConvolve:
+            modes[0] = type == ComplexToComplexDual ? VerticalDual : Vertical;
+            modes[1] = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
+
+            targets[0] = input_target;
+            targets[1] = Ny > 1 ? SSBO : input_target;
+            targets[2] = targets[1];
+            targets[3] = output_target;
+
+            radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, expand, wisdom, cost);
+            radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, false, wisdom, cost);
+            break;
+    }
+
+#if 0
+    print_radix_splits(context, radices);
+#endif
+
+    passes.reserve(radices[0].size() + radices[1].size() + expand);
+
+    unsigned index = 0;
+    unsigned last_index = (radices[1].empty() && !expand) ? 0 : 1;
+
+    for (auto &radix_direction : radices)
+    {
+        unsigned p = 1;
+        unsigned i = 0;
+        
+        for (auto &radix : radix_direction)
+        {
+            // If this is the last pass and we're writing to an image, use a special shader variant.
+            bool last_pass = index == last_index && i == radix_direction.size() - 1;
+
+            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
+            Target out_target = last_pass ? output_target : SSBO;
+            Target in_target = passes.empty() ? input_target : SSBO;
+            Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
+            unsigned uv_scale_x = radix.vector_size / type_to_input_components(type);
+
+            const Parameters params = {
+                radix.size.x,
+                radix.size.y,
+                radix.size.z,
+                radix.radix,
+                radix.vector_size,
+                dir,
+                modes[index],
+                in_target,
+                out_target,
+                p == 1,
+                radix.shared_banked,
+                options.type.fp16, input_fp16, options.type.output_fp16,
+                options.type.normalize,
+            };
+
+            const Pass pass = {
+                params,
+                radix.num_workgroups_x, radix.num_workgroups_y,
+                uv_scale_x,
+                next_pow2(radix.num_workgroups_x * params.workgroup_size_x),
+                get_program(params),
+            };
+
+            passes.push_back(pass);
+
+            p *= radix.radix;
+            i++;
+        }
+
+        // After the first transform direction, inject either a real-to-complex resolve or complex-to-real resolve.
+        // This way, we avoid having special purpose transforms for all FFT variants.
+        if (index == 0 && (type == ComplexToReal || type == RealToComplex))
+        {
+            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
+            bool last_pass = radices[1].empty();
+            Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
+            Target in_target = passes.empty() ? input_target : SSBO;
+            Target out_target = last_pass ? output_target : SSBO;
+            Mode mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex;
+            unsigned uv_scale_x = 1;
+
+            auto base_opts = options;
+            base_opts.type.input_fp16 = input_fp16;
+
+            auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts);
+            auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 });
+
+            const Parameters params = {
+                res.size.x,
+                res.size.y,
+                res.size.z,
+                res.radix,
+                res.vector_size,
+                dir,
+                mode,
+                in_target,
+                out_target,
+                true,
+                false,
+                base_opts.type.fp16, base_opts.type.input_fp16, base_opts.type.output_fp16,
+                base_opts.type.normalize,
+            };
+
+            const Pass pass = {
+                params,
+                Nx / res.size.x,
+                Ny / res.size.y,
+                uv_scale_x,
+                next_pow2(Nx),
+                get_program(params),
+            };
+
+            passes.push_back(pass);
+        }
+
+        index++;
+    }
+}
+
+string FFT::load_shader_string(const char *path)
+{
+    ifstream file(path);
+    if (!file.good())
+    {
+        throw runtime_error("Failed to load shader file from disk.\n");
+    }
+    stringstream buf;
+    buf << file.rdbuf();
+    return buf.str();
+}
+
+void FFT::store_shader_string(const char *path, const string &source)
+{
+    ofstream file(path);
+    file.write(source.data(), source.size());
+}
+
+unique_ptr<Program> FFT::build_program(const Parameters &params)
+{
+    string str;
+    str.reserve(16 * 1024);
+
+#if 0
+    context->log("Building program:\n");
+    context->log(
+            "   WG_X:      %u\n"
+            "   WG_Y:      %u\n"
+            "   WG_Z:      %u\n"
+            "   P1:        %u\n"
+            "   Radix:     %u\n"
+            "   Dir:       %d\n"
+            "   Mode:      %u\n"
+            "   InTarget:  %u\n"
+            "   OutTarget: %u\n"
+            "   FP16:      %u\n"
+            "   InFP16:    %u\n"
+            "   OutFP16:   %u\n"
+            "   Norm:      %u\n",
+            params.workgroup_size_x,
+            params.workgroup_size_y,
+            params.workgroup_size_z,
+            params.p1,
+            params.radix,
+            params.direction,
+            params.mode,
+            params.input_target,
+            params.output_target,
+            params.fft_fp16,
+            params.input_fp16,
+            params.output_fp16,
+            params.fft_normalize);
+#endif
+
+    if (params.p1)
+    {
+        str += "#define FFT_P1\n";
+    }
+
+    if (params.fft_fp16)
+    {
+        str += "#define FFT_FP16\n";
+    }
+
+    if (params.input_fp16)
+    {
+        str += "#define FFT_INPUT_FP16\n";
+    }
+
+    if (params.output_fp16)
+    {
+        str += "#define FFT_OUTPUT_FP16\n";
+    }
+
+    if (params.fft_normalize)
+    {
+        str += "#define FFT_NORMALIZE\n";
+    }
+
+    if (params.direction == InverseConvolve)
+    {
+        str += "#define FFT_CONVOLVE\n";
+    }
+
+    str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n";
+
+    str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n";
+    str += string("#define FFT_RADIX ") + to_string(params.radix) + "\n";
+    
+    unsigned vector_size = params.vector_size;
+    switch (params.mode)
+    {
+        case VerticalDual:
+            str += "#define FFT_DUAL\n";
+            str += "#define FFT_VERT\n";
+            break;
+
+        case Vertical:
+            str += "#define FFT_VERT\n";
+            break;
+
+        case HorizontalDual:
+            str += "#define FFT_DUAL\n";
+            str += "#define FFT_HORIZ\n";
+            break;
+
+        case Horizontal:
+            str += "#define FFT_HORIZ\n";
+            break;
+
+        case ResolveRealToComplex:
+            str += "#define FFT_RESOLVE_REAL_TO_COMPLEX\n";
+            str += "#define FFT_HORIZ\n";
+            vector_size = 2;
+            break;
+
+        case ResolveComplexToReal:
+            str += "#define FFT_RESOLVE_COMPLEX_TO_REAL\n";
+            str += "#define FFT_HORIZ\n";
+            vector_size = 2;
+            break;
+    }
+
+    switch (params.input_target)
+    {
+        case ImageReal:
+            str += "#define FFT_INPUT_REAL\n";
+            // Fallthrough
+        case Image:
+            str += "#define FFT_INPUT_TEXTURE\n";
+            break;
+
+        default:
+            break;
+    }
+
+    switch (params.output_target)
+    {
+        case ImageReal:
+            str += "#define FFT_OUTPUT_REAL\n";
+            // Fallthrough
+        case Image:
+            str += "#define FFT_OUTPUT_IMAGE\n";
+            break;
+
+        default:
+            break;
+    }
+
+    switch (vector_size)
+    {
+        case 2:
+            str += "#define FFT_VEC2\n";
+            break;
+
+        case 4:
+            str += "#define FFT_VEC4\n";
+            break;
+
+        case 8:
+            str += "#define FFT_VEC8\n";
+            break;
+    }
+
+    str += string("layout(local_size_x = ") +
+        to_string(params.workgroup_size_x) +
+        ", local_size_y = " +
+        to_string(params.workgroup_size_y) +
+        ", local_size_z = " +
+        to_string(params.workgroup_size_z) +
+        ") in;\n";
+    
+    str += load_shader_string("glfft/glsl/fft_common.comp");
+    switch (params.radix)
+    {
+        case 4:
+            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix4.comp");
+            break;
+
+        case 8:
+            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix8.comp");
+            break;
+
+        case 16:
+            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix4.comp");
+            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_shared.comp");
+            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix16.comp");
+            break;
+
+        case 64:
+            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix8.comp");
+            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_shared.comp");
+            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix64.comp");
+            break;
+    }
+    str += load_shader_string("glfft/glsl/fft_main.comp");
+
+    auto prog = context->compile_compute_shader(str.c_str());
+    if (!prog)
+    {
+        puts(str.c_str());
+    }
+
+#if 0
+    char shader_path[1024];
+    snprintf(shader_path, sizeof(shader_path), "glfft_shader_radix%u_first%u_mode%u_in_target%u_out_target%u.comp.src",
+            params.radix, params.p1, params.mode, unsigned(params.input_target), unsigned(params.output_target));
+    store_shader_string(shader_path, str);
+#endif
+
+    return prog;
+}
+
+double FFT::bench(Context *context, Resource *output, Resource *input,
+        unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, double max_time)
+{
+    context->wait_idle();
+    auto *cmd = context->request_command_buffer();
+    for (unsigned i = 0; i < warmup_iterations; i++)
+    {
+        process(cmd, output, input);
+    }
+    context->submit_command_buffer(cmd);
+    context->wait_idle();
+
+    unsigned runs = 0;
+    double start_time = context->get_time();
+    double total_time = 0.0;
+
+    for (unsigned i = 0; i < iterations && (((context->get_time() - start_time) < max_time) || i == 0); i++)
+    {
+#ifdef GLFFT_CLI_ASYNC
+        check_async_cancel();
+#endif
+
+        auto *cmd = context->request_command_buffer();
+
+        double iteration_start = context->get_time();
+        for (unsigned d = 0; d < dispatches_per_iteration; d++)
+        {
+            process(cmd, output, input);
+            cmd->barrier();
+            runs++;
+        }
+
+        context->submit_command_buffer(cmd);
+        context->wait_idle();
+
+        double iteration_end = context->get_time();
+        total_time += iteration_end - iteration_start;
+    }
+
+    return total_time / runs;
+}
+
+void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resource *input_aux)
+{
+    if (passes.empty())
+    {
+        return;
+    }
+
+    Resource *buffers[2] = {
+        input,
+        passes.size() & 1 ?
+            (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) :
+            temp_buffer.get(),
+    };
+
+    if (input_aux != 0)
+    {
+        if (passes.front().parameters.input_target != SSBO)
+        {
+            cmd->bind_texture(BindingTexture1, static_cast<Texture*>(input_aux));
+            cmd->bind_sampler(BindingTexture1, texture.samplers[1]);
+        }
+        else
+        {
+            if (ssbo.input_aux.size != 0)
+            {
+                cmd->bind_storage_buffer_range(BindingSSBOAux,
+                        ssbo.input_aux.offset, ssbo.input_aux.size, static_cast<Buffer*>(input_aux));
+            }
+            else
+            {
+                cmd->bind_storage_buffer(BindingSSBOAux, static_cast<Buffer*>(input_aux));
+            }
+        }
+    }
+
+    Program *current_program = nullptr;
+    unsigned p = 1;
+    unsigned pass_index = 0;
+
+    struct FFTConstantData
+    {
+        uint32_t p;
+        uint32_t stride;
+        uint32_t padding[2];
+        float offset_x, offset_y;
+        float scale_x, scale_y;
+    };
+
+    for (auto &pass : passes)
+    {
+        if (pass.program != current_program)
+        {
+            cmd->bind_program(pass.program);
+            current_program = pass.program;
+        }
+
+        if (pass.parameters.p1)
+        {
+            p = 1;
+        }
+
+        FFTConstantData constant_data;
+        constant_data.p = p;
+        constant_data.stride = pass.stride;
+        p *= pass.parameters.radix;
+
+        if (pass.parameters.input_target != SSBO)
+        {
+            cmd->bind_texture(BindingTexture0, static_cast<Texture*>(buffers[0]));
+            cmd->bind_sampler(BindingTexture0, texture.samplers[0]);
+
+            // If one compute thread reads multiple texels in X dimension, scale this accordingly.
+            float scale_x = texture.scale_x * pass.uv_scale_x;
+
+            constant_data.offset_x = texture.offset_x;
+            constant_data.offset_y = texture.offset_y;
+            constant_data.scale_x = scale_x;
+            constant_data.scale_y = texture.scale_y;
+        }
+        else
+        {
+            if (buffers[0] == input && ssbo.input.size != 0)
+            {
+                cmd->bind_storage_buffer_range(BindingSSBOIn,
+                        ssbo.input.offset, ssbo.input.size, static_cast<Buffer*>(buffers[0]));
+            }
+            else if (buffers[0] == output && ssbo.output.size != 0)
+            {
+                cmd->bind_storage_buffer_range(BindingSSBOIn,
+                        ssbo.output.offset, ssbo.output.size, static_cast<Buffer*>(buffers[0]));
+            }
+            else
+            {
+                cmd->bind_storage_buffer(BindingSSBOIn, static_cast<Buffer*>(buffers[0]));
+            }
+        }
+
+        if (pass.parameters.output_target != SSBO)
+        {
+            Format format = FormatUnknown;
+
+            // TODO: Make this more flexible, would require shader variants per-format though.
+            if (pass.parameters.output_target == ImageReal)
+            {
+                format = FormatR32Float;
+            }
+            else
+            {
+                switch (pass.parameters.mode)
+                {
+                    case VerticalDual:
+                    case HorizontalDual:
+                        format = FormatR16G16B16A16Float;
+                        break;
+
+                    case Vertical:
+                    case Horizontal:
+                    case ResolveRealToComplex:
+                        format = FormatR32Uint;
+                        break;
+
+                    default:
+                        break;
+                }
+            }
+            cmd->bind_storage_texture(BindingImage, static_cast<Texture*>(output), format);
+        }
+        else
+        {
+            if (buffers[1] == output && ssbo.output.size != 0)
+            {
+                cmd->bind_storage_buffer_range(BindingSSBOOut,
+                        ssbo.output.offset, ssbo.output.size, static_cast<Buffer*>(buffers[1]));
+            }
+            else
+            {
+                cmd->bind_storage_buffer(BindingSSBOOut, static_cast<Buffer*>(buffers[1]));
+            }
+        }
+
+        cmd->push_constant_data(BindingUBO, &constant_data, sizeof(constant_data));
+        cmd->dispatch(pass.workgroups_x, pass.workgroups_y, 1);
+
+        // For last pass, we don't know how our resource will be used afterwards,
+        // so let barrier decisions be up to the API user.
+        if (pass_index + 1 < passes.size())
+        {
+            cmd->barrier(static_cast<Buffer*>(buffers[1]));
+        }
+
+        if (pass_index == 0)
+        {
+            buffers[0] = passes.size() & 1 ?
+                temp_buffer.get() :
+                (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output);
+        }
+
+        swap(buffers[0], buffers[1]);
+        pass_index++;
+    }
+}
+
diff --git a/glfft/glfft.hpp b/glfft/glfft.hpp
new file mode 100644
index 0000000..c3b1a69
--- /dev/null
+++ b/glfft/glfft.hpp
@@ -0,0 +1,225 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef GLFFT_HPP__
+#define GLFFT_HPP__
+
+#include "glfft_interface.hpp"
+#include "glfft_common.hpp"
+#include "glfft_wisdom.hpp"
+#include <vector>
+#include <unordered_map>
+#include <limits>
+
+/// GLFFT doesn't try to preserve GL state in any way.
+/// E.g. SHADER_STORAGE_BUFFER bindings, programs bound, texture bindings, etc.
+/// Applications calling this library must expect that some GL state will be modified.
+/// No rendering state associated with graphics will be modified.
+
+namespace GLFFT
+{
+
+class FFT
+{
+    public:
+        /// @brief Creates a full FFT.
+        ///
+        /// All buffer allocation done by GLFFT will be done in constructor.
+        /// Will throw if invalid parameters are passed.
+        ///
+        /// @param context       The graphics context.
+        /// @param Nx            Number of samples in horizontal dimension.
+        /// @param Ny            Number of samples in vertical dimension.
+        /// @param type          The transform type.
+        /// @param direction     Forward, inverse or inverse with convolution.
+        ///                      For real-to-complex and complex-to-real transforms, the
+        ///                      transform type must match.
+        /// @param input_target  GL object type of input target. For real-to-complex with texture as input, ImageReal is used.
+        /// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used.
+        /// @param cache         A program cache for caching the GLFFT programs created.
+        /// @param options       FFT options such as performance related parameters and types.
+        /// @param wisdom        GLFFT wisdom which can override performance related options
+        ///                      (options.performance is used as a fallback).
+        FFT(Context *context, unsigned Nx, unsigned Ny,
+                Type type, Direction direction, Target input_target, Target output_target,
+                std::shared_ptr<ProgramCache> cache, const FFTOptions &options,
+                const FFTWisdom &wisdom = FFTWisdom());
+
+        /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs.
+        ///
+        /// All buffer allocation done by GLFFT will be done in constructor.
+        /// Will throw if invalid parameters are passed.
+        ///
+        /// @param context       The graphics context.
+        /// @param Nx            Number of samples in horizontal dimension.
+        /// @param Ny            Number of samples in vertical dimension.
+        /// @param radix         FFT radix to test.
+        /// @param p             Accumulated p factor. If 1, "first pass" mode is tested, otherwise, generic FFT stages.
+        /// @param mode          The transform mode.
+        /// @param input_target  GL object type of input target. For real-to-complex with texture as input, ImageReal is used.
+        /// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used.
+        /// @param cache         A program cache for caching the GLFFT programs created.
+        /// @param options       FFT options such as performance related parameters and types.
+        FFT(Context *context, unsigned Nx, unsigned Ny, unsigned radix, unsigned p,
+                Mode mode, Target input_target, Target output_target,
+                std::shared_ptr<ProgramCache> cache, const FFTOptions &options);
+
+        /// @brief Process the FFT.
+        ///
+        /// The type of object passed here must match what FFT was initialized with.
+        ///
+        /// @param cmd       Command buffer for issuing dispatch commands.
+        /// @param output    Output buffer or image.
+        ///                  NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D!
+        /// @param input     Input buffer or texture.
+        /// @param input_aux If using convolution transform type,
+        ///                  the content of input and input_aux will be multiplied together.
+        void process(CommandBuffer *cmd, Resource *output, Resource *input, Resource *input_aux = nullptr);
+
+        /// @brief Run process() multiple times, timing the results.
+        ///
+        /// Mostly used internally by GLFFT wisdom, glfft_cli's bench, and so on.
+        ///
+        /// @param context                  The graphics context.
+        /// @param output                   Output buffer or image.
+        ///                                 NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D!
+        /// @param input                    Input buffer or texture.
+        /// @param warmup_iterations        Number of iterations to run to "warm" up GL, ensures we don't hit
+        ///                                 recompilations or similar when benching.
+        /// @param iterations               Number of iterations to run the benchmark.
+        ///                                 Each iteration will ensure timing with a glFinish() followed by timing.
+        /// @param dispatches_per_iteration Number of calls to process() we should do per iteration.
+        /// @param max_time                 The max time the benchmark should run. Will be checked after each iteration is complete.
+        ///
+        /// @returns Average GPU time per process() call.
+        double bench(Context *context, Resource *output, Resource *input,
+                unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration,
+                double max_time = std::numeric_limits<double>::max());
+
+        /// @brief Returns cost for a process() call. Only used for debugging.
+        double get_cost() const { return cost; }
+
+        /// @brief Returns number of passes (glDispatchCompute) in a process() call.
+        unsigned get_num_passes() const { return passes.size(); }
+
+        /// @brief Returns Nx.
+        unsigned get_dimension_x() const { return size_x; }
+        /// @brief Returns Ny.
+        unsigned get_dimension_y() const { return size_y; }
+
+        /// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures.
+        ///
+        /// By default, these values are 0.5 / size (samples in the center of texel (0, 0)).
+        /// Scale is 1.0 / size, so it steps one texel for each coordinate in the FFT transform.
+        /// Setting this to something custom is useful to get downsampling with GL_LINEAR -> FFT transform
+        /// without having to downsample the texture first, then FFT.
+        void set_texture_offset_scale(float offset_x, float offset_y, float scale_x, float scale_y)
+        {
+            texture.offset_x = offset_x;
+            texture.offset_y = offset_y;
+            texture.scale_x = scale_x;
+            texture.scale_y = scale_y;
+        }
+
+        /// @brief Set binding range for input.
+        ///
+        /// If input is an SSBO, set a custom binding range to be passed to glBindBufferRange.
+        /// By default, the entire buffer is bound.
+        void set_input_buffer_range(size_t offset, size_t size)
+        {
+            ssbo.input.offset = offset;
+            ssbo.input.size = size;
+        }
+
+        /// @brief Set binding range for input_aux.
+        ///
+        /// If input_aux is an SSBO, set a custom binding range to be passed to glBindBufferRange.
+        /// By default, the entire buffer is bound.
+        void set_input_aux_buffer_range(size_t offset, size_t size)
+        {
+            ssbo.input_aux.offset = offset;
+            ssbo.input_aux.size = size;
+        }
+
+        /// @brief Set binding range for output.
+        ///
+        /// If output buffer is an SSBO, set a custom binding range to be passed to glBindBufferRange.
+        /// By default, the entire buffer is bound.
+        void set_output_buffer_range(size_t offset, size_t size)
+        {
+            ssbo.output.offset = offset;
+            ssbo.output.size = size;
+        }
+
+        /// @brief Set samplers for input textures.
+        ///
+        /// Set sampler objects to be used for input and input_aux if textures are used as input.
+        /// By default, sampler object 0 will be used (inheriting sampler parameters from the texture object itself).
+        void set_samplers(Sampler *sampler0, Sampler *sampler1 = nullptr)
+        {
+            texture.samplers[0] = sampler0;
+            texture.samplers[1] = sampler1;
+        }
+
+    private:
+        Context *context;
+
+        struct Pass
+        {
+            Parameters parameters;
+
+            unsigned workgroups_x;
+            unsigned workgroups_y;
+            unsigned uv_scale_x;
+            unsigned stride;
+            Program *program;
+        };
+
+        double cost = 0.0;
+
+        std::unique_ptr<Buffer> temp_buffer;
+        std::unique_ptr<Buffer> temp_buffer_image;
+        std::vector<Pass> passes;
+        std::shared_ptr<ProgramCache> cache;
+
+        std::unique_ptr<Program> build_program(const Parameters &params);
+        static std::string load_shader_string(const char *path);
+        static void store_shader_string(const char *path, const std::string &source);
+
+        Program* get_program(const Parameters &params);
+
+        struct
+        {
+            float offset_x = 0.0f, offset_y = 0.0f, scale_x = 1.0f, scale_y = 1.0f;
+            Sampler *samplers[2] = { nullptr, nullptr };
+        } texture;
+
+        struct
+        {
+            struct
+            {
+                size_t offset = 0;
+                size_t size = 0;
+            } input, input_aux, output;
+        } ssbo;
+        unsigned size_x, size_y;
+};
+
+}
+
+#endif
diff --git a/glfft/glfft_common.hpp b/glfft/glfft_common.hpp
new file mode 100644
index 0000000..4065f68
--- /dev/null
+++ b/glfft/glfft_common.hpp
@@ -0,0 +1,178 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+// For the most part used by the implementation.
+
+#ifndef GLFFT_COMMON_HPP__
+#define GLFFT_COMMON_HPP__
+
+#include "glfft_interface.hpp"
+#include <functional>
+#include <cstddef>
+#include <cstdlib>
+#include <string>
+#include <cstring>
+#include <memory>
+#include <unordered_map>
+
+namespace GLFFT
+{
+
+enum Direction
+{
+    /// Forward FFT transform.
+    Forward = -1,
+    /// Inverse FFT transform, but with two inputs (in frequency domain) which are multiplied together
+    /// for convolution.
+    InverseConvolve = 0,
+    /// Inverse FFT transform.
+    Inverse = 1
+};
+
+enum Mode
+{
+    Horizontal,
+    HorizontalDual,
+    Vertical,
+    VerticalDual,
+
+    ResolveRealToComplex,
+    ResolveComplexToReal,
+};
+
+enum Type
+{
+    /// Regular complex-to-complex transform.
+    ComplexToComplex,
+    /// Complex-to-complex dual transform where the complex value is four-dimensional,
+    /// i.e. a vector of two complex values. Typically used to transform RGBA data.
+    ComplexToComplexDual,
+    /// Complex-to-real transform. N / 2 + 1 complex values are used per row with a stride of N complex samples.
+    ComplexToReal,
+    /// Real-to-complex transform. N / 2 + 1 complex output samples are created per row with a stride of N complex samples.
+    RealToComplex
+};
+
+enum Target
+{
+    /// GL_SHADER_STORAGE_BUFFER
+    SSBO,
+    /// Textures, when used as output, type is determined by transform type.
+    /// ComplexToComplex / RealToComplex -> GL_RG16F
+    /// ComplexToComplexDual -> GL_RGBA16F
+    Image,
+    /// Real-valued (single component) textures, when used as output, type is determined by transform type.
+    /// ComplexToReal -> GL_R32F (because GLES 3.1 doesn't have GL_R16F image type).
+    ImageReal
+};
+
+struct Parameters
+{
+    unsigned workgroup_size_x;
+    unsigned workgroup_size_y;
+    unsigned workgroup_size_z;
+    unsigned radix;
+    unsigned vector_size;
+    Direction direction;
+    Mode mode;
+    Target input_target;
+    Target output_target;
+    bool p1;
+    bool shared_banked;
+    bool fft_fp16, input_fp16, output_fp16;
+    bool fft_normalize;
+
+    bool operator==(const Parameters &other) const
+    {
+        return std::memcmp(this, &other, sizeof(Parameters)) == 0;
+    }
+};
+
+/// @brief Options for FFT implementation.
+/// Defaults for performance as conservative.
+struct FFTOptions
+{
+    struct Performance
+    {
+        /// Workgroup size used in layout(local_size_x).
+        /// Only affects performance, however, large values may make implementations of smaller sized FFTs impossible.
+        /// FFT constructor will throw in this case.
+        unsigned workgroup_size_x = 4;
+        /// Workgroup size used in layout(local_size_x).
+        /// Only affects performance, however, large values may make implementations of smaller sized FFTs impossible.
+        /// FFT constructor will throw in this case.
+        unsigned workgroup_size_y = 1;
+        /// Vector size. Very GPU dependent. "Scalar" GPUs prefer 2 here, vector GPUs prefer 4 (and maybe 8).
+        unsigned vector_size = 2;
+        /// Whether to use banked shared memory or not.
+        /// Desktop GPUs prefer true here, false for mobile in general.
+        bool shared_banked = false;
+    } performance;
+
+    struct Type
+    {
+        /// Whether internal shader should be mediump float.
+        bool fp16 = false;
+        /// Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
+        bool input_fp16 = false;
+        /// Whether output SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
+        bool output_fp16 = false;
+        /// Whether to apply 1 / N normalization factor.
+        bool normalize = false;
+    } type;
+};
+
+}
+
+namespace std
+{
+    template<>
+    struct hash<GLFFT::Parameters>
+    {
+        std::size_t operator()(const GLFFT::Parameters &params) const
+        {
+            std::size_t h = 0;
+            hash<uint8_t> hasher;
+            for (std::size_t i = 0; i < sizeof(GLFFT::Parameters); i++)
+            {
+                h ^= hasher(reinterpret_cast<const uint8_t*>(&params)[i]);
+            }
+
+            return h;
+        }
+    };
+}
+
+namespace GLFFT
+{
+
+class ProgramCache
+{
+    public:
+        Program* find_program(const Parameters &parameters) const;
+        void insert_program(const Parameters &parameters, std::unique_ptr<Program> program);
+        size_t cache_size() const { return programs.size(); }
+
+    private:
+        std::unordered_map<Parameters, std::unique_ptr<Program>> programs;
+};
+
+}
+
+#endif
+
diff --git a/glfft/glfft_gl_api_headers.hpp b/glfft/glfft_gl_api_headers.hpp
new file mode 100644
index 0000000..d4714e0
--- /dev/null
+++ b/glfft/glfft_gl_api_headers.hpp
@@ -0,0 +1,6 @@
+
+/* Let GLFFT use GLava's headers */
+#define GLFFT_GLSL_LANG_STRING "#version 430 core\n"
+extern "C" {
+    #include "../glava/glad.h"
+}
diff --git a/glfft/glfft_gl_interface.cpp b/glfft/glfft_gl_interface.cpp
new file mode 100644
index 0000000..47e7ef4
--- /dev/null
+++ b/glfft/glfft_gl_interface.cpp
@@ -0,0 +1,310 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "glfft_gl_interface.hpp"
+#ifdef GLFFT_GL_DEBUG
+#include "glfft_validate.hpp"
+#endif
+#include <cstdarg>
+#include <cstring>
+#include <vector>
+
+using namespace GLFFT;
+using namespace std;
+
+GLCommandBuffer GLContext::static_command_buffer;
+
+void GLCommandBuffer::bind_program(Program *program)
+{
+    glUseProgram(program ? static_cast<GLProgram*>(program)->name : 0);
+}
+
+void GLCommandBuffer::bind_storage_texture(unsigned binding, Texture *texture, Format format)
+{
+    glBindImageTexture(binding, static_cast<GLTexture*>(texture)->name,
+            0, GL_FALSE, 0, GL_WRITE_ONLY, convert(format));
+}
+
+void GLCommandBuffer::bind_texture(unsigned binding, Texture *texture)
+{
+    glActiveTexture(GL_TEXTURE0 + binding);
+    glBindTexture(GL_TEXTURE_2D, static_cast<GLTexture*>(texture)->name);
+}
+
+void GLCommandBuffer::bind_sampler(unsigned binding, Sampler *sampler)
+{
+    glBindSampler(binding, sampler ? static_cast<GLSampler*>(sampler)->name : 0);
+}
+
+void GLCommandBuffer::bind_storage_buffer(unsigned binding, Buffer *buffer)
+{
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, binding, static_cast<GLBuffer*>(buffer)->name);
+}
+
+void GLCommandBuffer::bind_storage_buffer_range(unsigned binding, size_t offset, size_t size, Buffer *buffer)
+{
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, static_cast<GLBuffer*>(buffer)->name, offset, size);
+}
+
+void GLCommandBuffer::dispatch(unsigned x, unsigned y, unsigned z)
+{
+    glDispatchCompute(x, y, z);
+}
+
+void GLCommandBuffer::barrier(Buffer*)
+{
+    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+}
+
+void GLCommandBuffer::barrier(Texture*)
+{
+    glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
+}
+
+void GLCommandBuffer::barrier()
+{
+    glMemoryBarrier(GL_ALL_BARRIER_BITS);
+}
+
+void GLCommandBuffer::push_constant_data(unsigned binding, const void *data, size_t size)
+{
+    glBindBufferBase(GL_UNIFORM_BUFFER, binding, ubos[ubo_index]);
+    void *ptr = glMapBufferRange(GL_UNIFORM_BUFFER,
+            0, CommandBuffer::MaxConstantDataSize,
+            GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
+
+    if (ptr)
+    {
+        std::memcpy(ptr, data, size);
+        glUnmapBuffer(GL_UNIFORM_BUFFER);
+    }
+
+    if (++ubo_index >= ubo_count)
+        ubo_index = 0;
+}
+
+CommandBuffer* GLContext::request_command_buffer()
+{
+    if (!initialized_ubos)
+    {
+        glGenBuffers(MaxBuffersRing, ubos);
+        for (auto &ubo : ubos)
+        {
+            glBindBuffer(GL_UNIFORM_BUFFER, ubo);
+            glBufferData(GL_UNIFORM_BUFFER, CommandBuffer::MaxConstantDataSize, nullptr, GL_STREAM_DRAW);
+        }
+        static_command_buffer.set_constant_data_buffers(ubos, MaxBuffersRing);
+        initialized_ubos = true;
+    }
+    return &static_command_buffer;
+}
+
+void GLContext::submit_command_buffer(CommandBuffer*)
+{}
+
+void GLContext::wait_idle()
+{
+    glFinish();
+}
+
+unique_ptr<Texture> GLContext::create_texture(const void *initial_data,
+        unsigned width, unsigned height,
+        Format format)
+{
+    return unique_ptr<Texture>(new GLTexture(initial_data, width, height, format));
+}
+
+unique_ptr<Buffer> GLContext::create_buffer(const void *initial_data, size_t size, AccessMode access)
+{
+    return unique_ptr<Buffer>(new GLBuffer(initial_data, size, access));
+}
+
+unique_ptr<Program> GLContext::compile_compute_shader(const char *source)
+{
+#ifdef GLFFT_GL_DEBUG
+    if (!validate_glsl_source(source))
+        return nullptr;
+#endif
+
+    GLuint program = glCreateProgram();
+    if (!program)
+    {
+        return nullptr;
+    }
+
+    GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
+
+    const char *sources[] = { GLFFT_GLSL_LANG_STRING, source };
+    glShaderSource(shader, 2, sources, NULL);
+    glCompileShader(shader);
+
+    GLint status;
+    glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
+    if (status == GL_FALSE)
+    {
+        GLint len;
+        GLsizei out_len;
+
+        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &len);
+        vector<char> buf(len);
+        glGetShaderInfoLog(shader, len, &out_len, buf.data());
+        log("GLFFT: Shader log:\n%s\n\n", buf.data());
+
+        glDeleteShader(shader);
+        glDeleteProgram(program);
+        return 0;
+    }
+
+    glAttachShader(program, shader);
+    glLinkProgram(program);
+    glDeleteShader(shader);
+
+    glGetProgramiv(program, GL_LINK_STATUS, &status);
+    if (status == GL_FALSE)
+    {
+        GLint len;
+        GLsizei out_len;
+        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &len);
+        vector<char> buf(len);
+        glGetProgramInfoLog(program, len, &out_len, buf.data());
+        log("Program log:\n%s\n\n", buf.data());
+
+        glDeleteProgram(program);
+        glDeleteShader(shader);
+        return nullptr;
+    }
+
+    return unique_ptr<Program>(new GLProgram(program));
+}
+
+void GLContext::log(const char *fmt, ...)
+{
+    char buffer[4 * 1024];
+
+    va_list va;
+    va_start(va, fmt);
+    vsnprintf(buffer, sizeof(buffer), fmt, va);
+    va_end(va);
+    glfft_log("%s", buffer);
+}
+
+double GLContext::get_time()
+{
+    return glfft_time();
+}
+
+unsigned GLContext::get_max_work_group_threads()
+{
+    GLint value;
+    glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &value);
+    return value;
+}
+
+const char* GLContext::get_renderer_string()
+{
+    return reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+}
+
+const void* GLContext::map(Buffer *buffer, size_t offset, size_t size)
+{
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, static_cast<GLBuffer*>(buffer)->name);
+    const void *ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, offset, size, GL_MAP_READ_BIT);
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+    return ptr;
+}
+
+void GLContext::unmap(Buffer *buffer)
+{
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, static_cast<GLBuffer*>(buffer)->name);
+    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+}
+
+void GLContext::teardown()
+{
+    if (initialized_ubos)
+        glDeleteBuffers(MaxBuffersRing, ubos);
+    initialized_ubos = false;
+}
+
+GLContext::~GLContext()
+{
+    teardown();
+}
+
+GLTexture::GLTexture(const void *initial_data,
+        unsigned width, unsigned height,
+        Format format)
+{
+    glGenTextures(1, &name);
+    glBindTexture(GL_TEXTURE_2D, name);
+    glTexStorage2D(GL_TEXTURE_2D, 1, convert(format), width, height);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+    if (initial_data)
+    {
+        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height,
+                convert_format(format), convert_type(format), initial_data);
+    }
+
+    glBindTexture(GL_TEXTURE_2D, 0);
+}
+
+GLTexture::~GLTexture()
+{
+    if (owned)
+        glDeleteTextures(1, &name);
+}
+
+GLBuffer::GLBuffer(const void *initial_data, size_t size, AccessMode access)
+{
+    glGenBuffers(1, &name);
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, name);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, size, initial_data, convert(access));
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+}
+
+GLBuffer::~GLBuffer()
+{
+    if (owned)
+        glDeleteBuffers(1, &name);
+}
+
+GLProgram::GLProgram(GLuint name)
+    : name(name)
+{}
+
+GLProgram::~GLProgram()
+{
+    if (name != 0)
+    {
+        glDeleteProgram(name);
+    }
+}
+
+GLSampler::~GLSampler()
+{
+    if (name != 0)
+    {
+        glDeleteSamplers(1, &name);
+    }
+}
+
diff --git a/glfft/glfft_gl_interface.hpp b/glfft/glfft_gl_interface.hpp
new file mode 100644
index 0000000..43929c7
--- /dev/null
+++ b/glfft/glfft_gl_interface.hpp
@@ -0,0 +1,258 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef GLFFT_GL_INTERFACE_HPP__
+#define GLFFT_GL_INTERFACE_HPP__
+
+#include "glfft_interface.hpp"
+
+#include "glfft_gl_api_headers.hpp"
+
+/* GLava additions (POSIX) */
+extern "C" {
+    #include <time.h>
+    #include <stdarg.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <error.h>
+}
+
+#ifndef GLFFT_GLSL_LANG_STRING
+#error GLFFT_GLSL_LANG_STRING must be defined to e.g. "#version 310 es\n" or "#version 430 core\n".
+#endif
+
+#ifndef GLFFT_LOG_OVERRIDE
+void glfft_log(const char *fmt, ...) {
+    va_list l;
+    va_start(l, fmt);
+    vfprintf(stdout, fmt, l);
+    va_end(l);
+}
+#else
+#define glfft_log GLFFT_LOG_OVERRIDE
+#endif
+
+#ifndef GLFFT_TIME_OVERRIDE
+double glfft_time() {
+    struct timespec tv;
+    if (clock_gettime(CLOCK_REALTIME, &tv)) {
+        fprintf(stderr, "clock_gettime(CLOCK_REALTIME, ...): %s\n", strerror(errno));
+    }
+    return (double) tv.tv_sec + ((double) tv.tv_nsec / 1000000000.0);
+}
+#else
+#define glfft_time GLFFT_TIME_OVERRIDE
+#endif
+
+namespace GLFFT
+{
+    class GLContext;
+
+    class GLTexture : public Texture
+    {
+        public:
+            friend class GLContext;
+            friend class GLCommandBuffer;
+            ~GLTexture();
+
+            GLTexture(GLuint obj) : name(obj), owned(false) {}
+            GLuint get() const { return name; }
+
+        private:
+            GLTexture(const void *initial_data,
+                    unsigned width, unsigned height,
+                    Format format);
+            GLuint name;
+            bool owned = true;
+    };
+
+    // Not really used by test and bench code, but can be useful for API users.
+    class GLSampler : public Sampler
+    {
+        public:
+            friend class GLContext;
+            friend class GLCommandBuffer;
+            ~GLSampler();
+
+            GLSampler(GLuint obj) : name(obj) {}
+            GLuint get() const { return name; }
+
+        private:
+            GLuint name;
+    };
+
+    class GLBuffer : public Buffer
+    {
+        public:
+            friend class GLContext;
+            friend class GLCommandBuffer;
+            ~GLBuffer();
+
+            GLBuffer(GLuint obj) : name(obj), owned(false) {}
+            GLuint get() const { return name; }
+
+        private:
+            GLuint name;
+            GLBuffer(const void *initial_data, size_t size, AccessMode access);
+            bool owned = true;
+    };
+
+    class GLProgram : public Program
+    {
+        public:
+            friend class GLContext;
+            friend class GLCommandBuffer;
+            ~GLProgram();
+
+            GLuint get() const { return name; }
+
+        private:
+            GLProgram(GLuint name);
+            GLuint name;
+    };
+
+    class GLCommandBuffer : public CommandBuffer
+    {
+        public:
+            ~GLCommandBuffer() = default;
+
+            void set_constant_data_buffers(const GLuint *ubos, unsigned count)
+            {
+                this->ubos = ubos;
+                ubo_index = 0;
+                ubo_count = count;
+            }
+
+            void bind_program(Program *program) override;
+            void bind_storage_texture(unsigned binding, Texture *texture, Format format) override;
+            void bind_texture(unsigned binding, Texture *texture) override;
+            void bind_sampler(unsigned binding, Sampler *sampler) override;
+            void bind_storage_buffer(unsigned binding, Buffer *texture) override;
+            void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) override;
+            void dispatch(unsigned x, unsigned y, unsigned z) override;
+
+            void barrier(Buffer *buffer) override;
+            void barrier(Texture *buffer) override;
+            void barrier() override;
+
+            void push_constant_data(unsigned binding, const void *data, size_t size) override;
+
+        private:
+            const GLuint *ubos = nullptr;
+            unsigned ubo_count = 0;
+            unsigned ubo_index = 0;
+    };
+
+    class GLContext : public Context
+    {
+        public:
+            ~GLContext();
+
+            std::unique_ptr<Texture> create_texture(const void *initial_data,
+                    unsigned width, unsigned height,
+                    Format format) override;
+
+            std::unique_ptr<Buffer> create_buffer(const void *initial_data, size_t size, AccessMode access) override;
+            std::unique_ptr<Program> compile_compute_shader(const char *source) override;
+
+            CommandBuffer* request_command_buffer() override;
+            void submit_command_buffer(CommandBuffer *cmd) override;
+            void wait_idle() override;
+
+            const char* get_renderer_string() override;
+            void log(const char *fmt, ...) override;
+            double get_time() override;
+
+            unsigned get_max_work_group_threads() override;
+
+            const void* map(Buffer *buffer, size_t offset, size_t size) override;
+            void unmap(Buffer *buffer) override;
+
+            // Not supported in GLES, so override when creating platform-specific context.
+            bool supports_texture_readback() override { return false; }
+            void read_texture(void*, Texture*, Format) override {}
+
+        protected:
+            void teardown();
+
+        private:
+            static GLCommandBuffer static_command_buffer;
+
+            enum { MaxBuffersRing = 256 };
+            GLuint ubos[MaxBuffersRing];
+            bool initialized_ubos = false;
+    };
+
+    static inline GLenum convert(AccessMode mode)
+    {
+        switch (mode)
+        {
+            case AccessStreamCopy: return GL_STREAM_COPY;
+            case AccessStaticCopy: return GL_STATIC_COPY;
+            case AccessStreamRead: return GL_STREAM_READ;
+        }
+        return 0;
+    }
+
+    static inline GLenum convert(Format format)
+    {
+        switch (format)
+        {
+            case FormatR16G16B16A16Float: return GL_RGBA16F;
+            case FormatR32G32B32A32Float: return GL_RGBA32F;
+            case FormatR32Float: return GL_R32F;
+            case FormatR16G16Float: return GL_RG16F;
+            case FormatR32G32Float: return GL_RG32F;
+            case FormatR32Uint: return GL_R32UI;
+            case FormatUnknown: return 0;
+        }
+        return 0;
+    }
+
+    static inline GLenum convert_format(Format format)
+    {
+        switch (format)
+        {
+            case FormatR16G16Float: return GL_RG;
+            case FormatR32G32Float: return GL_RG;
+            case FormatR16G16B16A16Float: return GL_RGBA;
+            case FormatR32G32B32A32Float: return GL_RGBA;
+            case FormatR32Float: return GL_RED;
+            case FormatR32Uint: return GL_RED_INTEGER;
+            case FormatUnknown: return 0;
+        }
+        return 0;
+    }
+
+    static inline GLenum convert_type(Format format)
+    {
+        switch (format)
+        {
+            case FormatR16G16Float: return GL_HALF_FLOAT;
+            case FormatR16G16B16A16Float: return GL_HALF_FLOAT;
+            case FormatR32Float: return GL_FLOAT;
+            case FormatR32G32Float: return GL_FLOAT;
+            case FormatR32G32B32A32Float: return GL_FLOAT;
+            case FormatR32Uint: return GL_UNSIGNED_INT;
+            case FormatUnknown: return 0;
+        }
+        return 0;
+    }
+}
+
+#endif
diff --git a/glfft/glfft_interface.hpp b/glfft/glfft_interface.hpp
new file mode 100644
index 0000000..c2cc909
--- /dev/null
+++ b/glfft/glfft_interface.hpp
@@ -0,0 +1,131 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef GLFFT_INTERFACE_HPP__
+#define GLFFT_INTERFACE_HPP__
+
+#include <memory>
+
+namespace GLFFT
+{
+    class Context;
+
+    class Resource
+    {
+        public:
+            virtual ~Resource() = default;
+
+            // Non-movable, non-copyable to make things simpler.
+            Resource(Resource&&) = delete;
+            void operator=(const Resource&) = delete;
+
+        protected:
+            Resource() = default;
+    };
+
+    class Texture : public Resource {};
+    class Sampler : public Resource {};
+    class Buffer : public Resource {};
+
+    class Program
+    {
+        public:
+            virtual ~Program() = default;
+        protected:
+            friend class Context;
+            Program() = default;
+    };
+
+    enum AccessMode
+    {
+        AccessStreamCopy,
+        AccessStaticCopy,
+        AccessStreamRead
+    };
+
+    enum Format
+    {
+        FormatUnknown,
+        FormatR16G16B16A16Float,
+        FormatR32G32B32A32Float,
+        FormatR32G32Float,
+        FormatR32Float,
+        FormatR16G16Float,
+        FormatR32Uint
+    };
+
+    class CommandBuffer;
+
+    class Context
+    {
+        public:
+            virtual ~Context() = default;
+
+            virtual std::unique_ptr<Texture> create_texture(const void *initial_data,
+                    unsigned width, unsigned height,
+                    Format format) = 0;
+
+            virtual std::unique_ptr<Buffer> create_buffer(const void *initial_data, size_t size, AccessMode access) = 0;
+            virtual std::unique_ptr<Program> compile_compute_shader(const char *source) = 0;
+
+            virtual CommandBuffer* request_command_buffer() = 0;
+            virtual void submit_command_buffer(CommandBuffer *cmd) = 0;
+            virtual void wait_idle() = 0;
+
+            virtual const char* get_renderer_string() = 0;
+            virtual void log(const char *fmt, ...) = 0;
+            virtual double get_time() = 0;
+
+            virtual unsigned get_max_work_group_threads() = 0;
+
+            virtual const void* map(Buffer *buffer, size_t offset, size_t size) = 0;
+            virtual void unmap(Buffer *buffer) = 0;
+
+            virtual bool supports_texture_readback() = 0;
+            virtual void read_texture(void *buffer, Texture *texture, Format format) = 0;
+
+        protected:
+            Context() = default;
+    };
+
+    class CommandBuffer
+    {
+        public:
+            virtual ~CommandBuffer() = default;
+
+            virtual void bind_program(Program *program) = 0;
+            virtual void bind_storage_texture(unsigned binding, Texture *texture, Format format) = 0;
+            virtual void bind_texture(unsigned binding, Texture *texture) = 0;
+            virtual void bind_sampler(unsigned binding, Sampler *sampler) = 0;
+            virtual void bind_storage_buffer(unsigned binding, Buffer *texture) = 0;
+            virtual void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) = 0;
+            virtual void dispatch(unsigned x, unsigned y, unsigned z) = 0;
+
+            virtual void barrier(Buffer *buffer) = 0;
+            virtual void barrier(Texture *buffer) = 0;
+            virtual void barrier() = 0;
+
+            enum { MaxConstantDataSize = 64 };
+            virtual void push_constant_data(unsigned binding, const void *data, size_t size) = 0;
+
+        protected:
+            CommandBuffer() = default;
+    };
+}
+
+#endif
diff --git a/glfft/glfft_wisdom.cpp b/glfft/glfft_wisdom.cpp
new file mode 100644
index 0000000..ad18317
--- /dev/null
+++ b/glfft/glfft_wisdom.cpp
@@ -0,0 +1,600 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "glfft_wisdom.hpp"
+#include "glfft_interface.hpp"
+#include "glfft.hpp"
+#include <utility>
+
+/* GLAVA NOTICE: automatic wisdom serialization support may be added at a late date */
+#ifdef GLFFT_SERIALIZATION
+#include "rapidjson/reader.h"
+#include "rapidjson/prettywriter.h"
+#include "rapidjson/stringbuffer.h"
+#include "rapidjson/document.h"
+using namespace rapidjson;
+#endif
+
+#ifdef GLFFT_CLI_ASYNC
+#include "glfft_cli.hpp"
+#endif
+
+using namespace std;
+using namespace GLFFT;
+
+FFTStaticWisdom FFTWisdom::get_static_wisdom_from_renderer(Context *context)
+{
+    FFTStaticWisdom res;
+
+    const char *renderer = context->get_renderer_string();
+    unsigned threads = context->get_max_work_group_threads();
+
+    if (strstr(renderer, "GeForce") || strstr(renderer, "Quadro"))
+    {
+        context->log("Detected GeForce/Quadro GPU.\n");
+        res.min_workgroup_size = 32; // Warp threads.
+        res.min_workgroup_size_shared = 32;
+        res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
+        res.min_vector_size = 2;
+        res.max_vector_size = 2;
+        res.shared_banked = FFTStaticWisdom::True;
+    }
+    else if (strstr(renderer, "Radeon"))
+    {
+        context->log("Detected Radeon GPU.\n");
+        res.min_workgroup_size = 64; // Wavefront threads (GCN).
+        res.min_workgroup_size_shared = 128;
+        res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
+        // TODO: Find if we can restrict this to 2 or 4 always.
+        res.min_vector_size = 2;
+        res.max_vector_size = 4;
+        res.shared_banked = FFTStaticWisdom::True;
+    }
+    else if (strstr(renderer, "Mali"))
+    {
+        context->log("Detected Mali GPU.\n");
+
+        res.min_workgroup_size = 4;
+        res.min_workgroup_size_shared = 4;
+        res.max_workgroup_size = 64; // Going beyond 64 threads per WG is not a good idea.
+        res.min_vector_size = 4;
+        res.max_vector_size = 4;
+        res.shared_banked = FFTStaticWisdom::False;
+    }
+    // TODO: Add more GPUs.
+
+    return res;
+}
+
+pair<double, FFTOptions::Performance> FFTWisdom::learn_optimal_options(
+        Context *context, unsigned Nx, unsigned Ny, unsigned radix,
+        Mode mode, Target input_target, Target output_target,
+        const FFTOptions::Type &type)
+{
+    WisdomPass pass = {
+        {
+            Nx, Ny, radix, mode, input_target, output_target,
+            type,
+        },
+        0.0,
+    };
+
+    auto itr = library.find(pass);
+    if (itr != end(library))
+    {
+        return make_pair(itr->first.cost, itr->second);
+    }
+    else
+    {
+        auto result = study(context, pass, type);
+        pass.cost = result.first;
+        library[pass] = result.second;
+
+        return result;
+    }
+}
+
+void FFTWisdom::learn_optimal_options_exhaustive(Context *context,
+        unsigned Nx, unsigned Ny,
+        Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type)
+{
+    bool learn_resolve = type == ComplexToReal || type == RealToComplex;
+    Mode vertical_mode = type == ComplexToComplexDual ? VerticalDual : Vertical;
+    Mode horizontal_mode = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
+
+    // Create wisdom for horizontal transforms and vertical transform.
+    static const unsigned radices[] = { 4, 8, 16, 64 };
+    for (auto radix : radices)
+    {
+        try
+        {
+            // If we're doing SSBO -> Image or Image -> SSBO. Create wisdom for the two variants.
+
+            // Learn plain transforms.
+            if (Ny > 1)
+            {
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, SSBO, fft_type);
+            }
+            learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, SSBO, fft_type);
+
+            // Learn the first/last pass transforms. Can be fairly significant since accessing textures makes more sense with
+            // block interleave and larger WG_Y sizes.
+            if (input_target != SSBO)
+            {
+                if (Ny > 1)
+                {
+                    learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, input_target, SSBO, fft_type);
+                }
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, input_target, SSBO, fft_type);
+            }
+
+            if (output_target != SSBO)
+            {
+                if (Ny > 1)
+                {
+                    learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, output_target, fft_type);
+                }
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, output_target, fft_type);
+            }
+        }
+#ifdef GLFFT_CLI_ASYNC
+        catch (const AsyncCancellation &)
+        {
+            throw;
+        }
+#endif
+        catch (...)
+        {
+            // If our default options cannot successfully create the radix pass (i.e. throws),
+            // just ignore it for purpose of creating wisdom.
+        }
+    }
+
+    auto resolve_type = fft_type;
+    resolve_type.input_fp16 = resolve_type.output_fp16;
+    Mode resolve_mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex;
+    Target resolve_input_target = SSBO;
+
+    // If we have C2R Nx1 transform, the first pass is resolve, so use those types.
+    if (type == ComplexToReal && Ny == 1)
+    {
+        resolve_type = fft_type;
+        resolve_input_target = input_target;
+    }
+
+    // If we need to do a resolve pass, train this case as well.
+    if (learn_resolve)
+    {
+        try
+        {
+            // If Ny == 1 and we're doing RealToComplex, this will be the last pass, so use output_target as target.
+            if (Ny == 1 && resolve_mode == ResolveRealToComplex)
+            {
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, output_target, resolve_type);
+            }
+            else
+            {
+                learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, SSBO, resolve_type);
+            }
+        }
+#ifdef GLFFT_CLI_ASYNC
+        catch (const AsyncCancellation &)
+        {
+            throw;
+        }
+#endif
+        catch (...)
+        {
+            // If our default options cannot successfully create the radix pass (i.e. throws),
+            // just ignore it for purpose of creating wisdom.
+        }
+    }
+}
+
+double FFTWisdom::bench(Context *context, Resource *output, Resource *input,
+        const WisdomPass &pass, const FFTOptions &options, const shared_ptr<ProgramCache> &cache) const
+{
+    FFT fft(context, pass.pass.Nx, pass.pass.Ny, pass.pass.radix, pass.pass.input_target != SSBO ? 1 : pass.pass.radix,
+            pass.pass.mode, pass.pass.input_target, pass.pass.output_target,
+            cache, options);
+
+    return fft.bench(context,
+            output, input, params.warmup, params.iterations, params.dispatches, params.timeout);
+}
+
+static inline unsigned mode_to_size(Mode mode)
+{
+    switch (mode)
+    {
+        case VerticalDual:
+        case HorizontalDual:
+        case ResolveRealToComplex:
+        case ResolveComplexToReal:
+            return 4;
+
+        default:
+            return 2;
+    }
+}
+
+std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, const WisdomPass &pass, FFTOptions::Type type) const
+{
+    auto cache = make_shared<ProgramCache>();
+
+    unique_ptr<Resource> output;
+    unique_ptr<Resource> input;
+
+    unsigned mode_size = mode_to_size(pass.pass.mode);
+    vector<float> tmp(mode_size * pass.pass.Nx * pass.pass.Ny);
+
+    if (pass.pass.input_target == SSBO)
+    {
+        input = context->create_buffer(tmp.data(), tmp.size() * sizeof(float) >> type.input_fp16, AccessStaticCopy);
+    }
+    else
+    {
+        Format format = FormatUnknown;
+        unsigned Nx = pass.pass.Nx;
+        unsigned Ny = pass.pass.Ny;
+
+        switch (pass.pass.mode)
+        {
+            case VerticalDual:
+            case HorizontalDual:
+                format = FormatR32G32B32A32Float;
+                break;
+
+            case Vertical:
+            case Horizontal:
+                format = FormatR32G32Float;
+                break;
+
+            case ResolveComplexToReal:
+                format = FormatR32G32Float;
+                Nx *= 2;
+                break;
+
+            default:
+                throw logic_error("Invalid input mode.\n");
+        }
+
+        input = context->create_texture(tmp.data(), Nx, Ny, format);
+    }
+
+    if (pass.pass.output_target == SSBO)
+    {
+        output = context->create_buffer(nullptr, tmp.size() * sizeof(float) >> type.output_fp16, AccessStreamCopy);
+    }
+    else
+    {
+        Format format = FormatUnknown;
+        unsigned Nx = pass.pass.Nx;
+        unsigned Ny = pass.pass.Ny;
+
+        switch (pass.pass.mode)
+        {
+            case VerticalDual:
+            case HorizontalDual:
+                format = FormatR32G32B32A32Float;
+                break;
+
+            case Vertical:
+            case Horizontal:
+                format = FormatR32G32Float;
+                break;
+
+            case ResolveRealToComplex:
+                format = FormatR32G32Float;
+                Nx *= 2;
+                break;
+
+            default:
+                throw logic_error("Invalid output mode.\n");
+        }
+
+        output = context->create_texture(nullptr, Nx, Ny, format);
+    }
+
+    // Exhaustive search, look for every sensible combination, and find fastest parameters.
+    // Get initial best cost with defaults.
+    FFTOptions::Performance best_perf;
+    double minimum_cost = bench(context, output.get(), input.get(), pass, { best_perf, type }, cache);
+
+    static const FFTStaticWisdom::Tristate shared_banked_values[] = { FFTStaticWisdom::False, FFTStaticWisdom::True };
+    static const unsigned vector_size_values[] = { 2, 4, 8 };
+    static const unsigned workgroup_size_x_values[] = { 4, 8, 16, 32, 64, 128, 256 };
+    static const unsigned workgroup_size_y_values[] = { 1, 2, 4, 8, };
+
+    bool test_resolve = pass.pass.mode == ResolveComplexToReal || pass.pass.mode == ResolveRealToComplex;
+    bool test_dual = pass.pass.mode == VerticalDual || pass.pass.mode == HorizontalDual;
+    unsigned bench_count = 0;
+
+    for (auto shared_banked : shared_banked_values)
+    {
+        // Useless test, since shared banked is only relevant for radix 16/64.
+        if (pass.pass.radix < 16 && shared_banked)
+        {
+            continue;
+        }
+
+        bool fair_shared_banked = (pass.pass.radix < 16) ||
+                                  (static_wisdom.shared_banked == FFTStaticWisdom::DontCare) ||
+                                  (shared_banked == static_wisdom.shared_banked);
+
+        if (!fair_shared_banked)
+        {
+            continue;
+        }
+
+        for (auto vector_size : vector_size_values)
+        {
+            // Resolve passes currently only support vector size 2. Shared banked makes no sense either.
+            if (test_resolve && (vector_size != 2 || shared_banked))
+            {
+                continue;
+            }
+
+            // We can only use vector_size 8 with FP16.
+            if (vector_size == 8 && (!type.fp16 || !type.input_fp16 || !type.output_fp16))
+            {
+                continue;
+            }
+
+            // Makes little sense to test since since vector_size will be bumped to 4 anyways.
+            if (test_dual && vector_size < 4)
+            {
+                continue;
+            }
+
+            for (auto workgroup_size_x : workgroup_size_x_values)
+            {
+                for (auto workgroup_size_y : workgroup_size_y_values)
+                {
+                    unsigned workgroup_size  = workgroup_size_x * workgroup_size_y;
+
+                    unsigned min_workgroup_size = pass.pass.radix >= 16 ? static_wisdom.min_workgroup_size_shared :
+                                                                          static_wisdom.min_workgroup_size;
+
+                    unsigned min_vector_size = test_dual ? max(4u, static_wisdom.min_vector_size) : static_wisdom.min_vector_size;
+                    unsigned max_vector_size = test_dual ? max(4u, static_wisdom.max_vector_size) : static_wisdom.max_vector_size;
+
+                    bool fair_workgroup_size = workgroup_size <= static_wisdom.max_workgroup_size &&
+                                               workgroup_size >= min_workgroup_size;
+                    if (pass.pass.Ny == 1 && workgroup_size_y > 1)
+                    {
+                        fair_workgroup_size = false;
+                    }
+
+                    if (!fair_workgroup_size)
+                    {
+                        continue;
+                    }
+
+                    // If we have dual mode, accept vector sizes larger than max.
+                    bool fair_vector_size = test_resolve || (vector_size <= max_vector_size &&
+                                                             vector_size >= min_vector_size);
+
+                    if (!fair_vector_size)
+                    {
+                        continue;
+                    }
+
+                    FFTOptions::Performance perf;
+                    perf.shared_banked = shared_banked;
+                    perf.vector_size = vector_size;
+                    perf.workgroup_size_x = workgroup_size_x;
+                    perf.workgroup_size_y = workgroup_size_y;
+
+                    try
+                    {
+                        // If workgroup sizes are too big for our test, this will throw.
+                        double cost = bench(context, output.get(), input.get(), pass, { perf, type }, cache);
+                        bench_count++;
+
+#if 1
+                        context->log("\nWisdom run (mode = %u, radix = %u):\n", pass.pass.mode, pass.pass.radix);
+                        context->log("  Width:            %4u\n", pass.pass.Nx);
+                        context->log("  Height:           %4u\n", pass.pass.Ny);
+                        context->log("  Shared banked:     %3s\n", shared_banked ? "yes" : "no");
+                        context->log("  Vector size:         %u\n", vector_size);
+                        context->log("  Workgroup size: (%u, %u)\n", workgroup_size_x, workgroup_size_y);
+                        context->log("  Cost:         %8.3g\n", cost);
+#endif
+
+                        if (cost < minimum_cost)
+                        {
+#if 1
+                            context->log("  New optimal solution! (%g -> %g)\n", minimum_cost, cost);
+#endif
+                            best_perf = perf;
+                            minimum_cost = cost;
+                        }
+                    }
+#ifdef GLFFT_CLI_ASYNC
+                    catch (const AsyncCancellation &)
+                    {
+                        throw;
+                    }
+#endif
+                    catch (...)
+                    {
+                        // If we pass in bogus parameters,
+                        // FFT will throw and we just ignore this.
+                    }
+                }
+            }
+        }
+    }
+
+    context->log("Tested %u variants!\n", bench_count);
+    return make_pair(minimum_cost, best_perf);
+}
+
+const pair<const WisdomPass, FFTOptions::Performance>* FFTWisdom::find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
+        Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type) const
+{
+    WisdomPass pass = {
+        {
+            Nx, Ny, radix, mode, input_target, output_target,
+            type,
+        },
+        0.0,
+    };
+
+    auto itr = library.find(pass);
+    return itr != end(library) ? (&(*itr)) : nullptr;
+}
+
+const FFTOptions::Performance& FFTWisdom::find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
+        Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const
+{
+    WisdomPass pass = {
+        {
+            Nx, Ny, radix, mode, input_target, output_target,
+            base_options.type,
+        },
+        0.0,
+    };
+
+    auto itr = library.find(pass);
+
+#if 0
+    if (itr == end(library))
+    {
+        context->log("Didn't find options for (%u x %u, radix %u, mode %u, input_target %u, output_target %u)\n",
+                Nx, Ny, radix, unsigned(mode), unsigned(input_target), unsigned(output_target));
+    }
+#endif
+
+    return itr != end(library) ? itr->second : base_options.performance;
+}
+
+#ifdef GLFFT_SERIALIZATION
+std::string FFTWisdom::archive() const
+{
+    StringBuffer s;
+    PrettyWriter<StringBuffer> writer{s};
+
+    writer.StartObject();
+    writer.String("library");
+
+    // Serialize all wisdom accumulated to a string.
+    writer.StartArray();
+    for (auto &entry : library)
+    {
+        writer.StartObject();
+
+        writer.String("scenario");
+        writer.StartObject();
+        writer.String("nx");
+        writer.Uint(entry.first.pass.Nx);
+        writer.String("ny");
+        writer.Uint(entry.first.pass.Ny);
+        writer.String("radix");
+        writer.Uint(entry.first.pass.radix);
+        writer.String("mode");
+        writer.Uint(entry.first.pass.mode);
+        writer.String("input_target");
+        writer.Uint(entry.first.pass.input_target);
+        writer.String("output_target");
+        writer.Uint(entry.first.pass.output_target);
+        writer.EndObject();
+
+        writer.String("type");
+        writer.StartObject();
+        writer.String("fp16");
+        writer.Bool(entry.first.pass.type.fp16);
+        writer.String("input_fp16");
+        writer.Bool(entry.first.pass.type.input_fp16);
+        writer.String("output_fp16");
+        writer.Bool(entry.first.pass.type.output_fp16);
+        writer.String("normalize");
+        writer.Bool(entry.first.pass.type.normalize);
+        writer.EndObject();
+
+        writer.String("performance");
+        writer.StartObject();
+        writer.String("shared_banked");
+        writer.Bool(entry.second.shared_banked);
+        writer.String("vector_size");
+        writer.Uint(entry.second.vector_size);
+        writer.String("workgroup_size_x");
+        writer.Uint(entry.second.workgroup_size_x);
+        writer.String("workgroup_size_y");
+        writer.Uint(entry.second.workgroup_size_y);
+        writer.EndObject();
+
+        writer.String("cost");
+        writer.Double(entry.first.cost);
+
+        writer.EndObject();
+    }
+    writer.EndArray();
+    writer.EndObject();
+    return s.GetString();
+}
+
+void FFTWisdom::extract(const char *json)
+{
+    Document document;
+    document.Parse(json);
+
+    // Exception safe, we don't want to risk throwing in the middle of the
+    // loop, leaving the library is broken state.
+    unordered_map<WisdomPass, FFTOptions::Performance> new_library;
+
+    auto &lib = document["library"];
+
+    // y u no begin(), end() :(
+    for (Value::ConstValueIterator itr = lib.Begin(); itr != lib.End(); ++itr)
+    {
+        auto &v = *itr;
+
+        WisdomPass pass;
+        FFTOptions::Performance perf;
+
+        pass.cost = v["cost"].GetDouble();
+
+        auto &scenario = v["scenario"];
+        pass.pass.Nx = scenario["nx"].GetUint();
+        pass.pass.Ny = scenario["ny"].GetUint();
+        pass.pass.radix = scenario["radix"].GetUint();
+        pass.pass.mode = static_cast<Mode>(scenario["mode"].GetUint());
+        pass.pass.input_target = static_cast<Target>(scenario["input_target"].GetUint());
+        pass.pass.output_target = static_cast<Target>(scenario["output_target"].GetUint());
+
+        auto &type = v["type"];
+        pass.pass.type.fp16 = type["fp16"].GetBool();
+        pass.pass.type.input_fp16 = type["input_fp16"].GetBool();
+        pass.pass.type.output_fp16 = type["output_fp16"].GetBool();
+        pass.pass.type.normalize = type["normalize"].GetBool();
+
+        auto &performance = v["performance"];
+        perf.shared_banked = performance["shared_banked"].GetBool();
+        perf.vector_size = performance["vector_size"].GetUint();
+        perf.workgroup_size_x = performance["workgroup_size_x"].GetUint();
+        perf.workgroup_size_y = performance["workgroup_size_y"].GetUint();
+
+        new_library[pass] = perf;
+    }
+
+    // Exception safe.
+    swap(library, new_library);
+}
+#endif
+
diff --git a/glfft/glfft_wisdom.hpp b/glfft/glfft_wisdom.hpp
new file mode 100644
index 0000000..d2a11ee
--- /dev/null
+++ b/glfft/glfft_wisdom.hpp
@@ -0,0 +1,149 @@
+/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
+ *
+ * Permission is hereby granted, free of charge,
+ * to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef GLFFT_WISDOM_HPP__
+#define GLFFT_WISDOM_HPP__
+
+#include <unordered_map>
+#include <utility>
+#include <string>
+#include "glfft_common.hpp"
+#include "glfft_interface.hpp"
+
+namespace GLFFT
+{
+
+struct WisdomPass
+{
+    struct
+    {
+        unsigned Nx;
+        unsigned Ny;
+        unsigned radix;
+        Mode mode;
+        Target input_target;
+        Target output_target;
+        FFTOptions::Type type;
+    } pass;
+
+    double cost;
+
+    bool operator==(const WisdomPass &other) const
+    {
+        return std::memcmp(&pass, &other.pass, sizeof(pass)) == 0;
+    }
+};
+
+}
+
+namespace std
+{
+    template<>
+    struct hash<GLFFT::WisdomPass>
+    {
+        std::size_t operator()(const GLFFT::WisdomPass &params) const
+        {
+            std::size_t h = 0;
+            hash<uint8_t> hasher;
+            for (std::size_t i = 0; i < sizeof(params.pass); i++)
+            {
+                h ^= hasher(reinterpret_cast<const uint8_t*>(&params.pass)[i]);
+            }
+
+            return h;
+        }
+    };
+}
+
+namespace GLFFT
+{
+
+// Adds information which depends on the GPU vendor.
+// This can speed up learning process, since there will be fewer "obviously wrong" settings to test.
+struct FFTStaticWisdom
+{
+    enum Tristate { True = 1, False = 0, DontCare = -1 };
+
+    unsigned min_workgroup_size = 1;
+    unsigned min_workgroup_size_shared = 1;
+    unsigned max_workgroup_size = 128; // GLES 3.1 mandates support for this.
+    unsigned min_vector_size = 2;
+    unsigned max_vector_size = 4;
+    Tristate shared_banked = DontCare;
+};
+
+class FFTWisdom
+{
+    public:
+        std::pair<double, FFTOptions::Performance> learn_optimal_options(Context *ctx,
+                unsigned Nx, unsigned Ny, unsigned radix,
+                Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type);
+
+        void learn_optimal_options_exhaustive(Context *ctx,
+                unsigned Nx, unsigned Ny,
+                Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type);
+
+        const std::pair<const WisdomPass, FFTOptions::Performance>* find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
+                Mode mode, Target input_target, Target output_target, const FFTOptions::Type &base_options) const;
+
+        const FFTOptions::Performance& find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
+                Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const;
+
+        void set_static_wisdom(FFTStaticWisdom static_wisdom) { this->static_wisdom = static_wisdom; }
+        static FFTStaticWisdom get_static_wisdom_from_renderer(Context *context);
+
+        void set_bench_params(unsigned warmup,
+                unsigned iterations, unsigned dispatches, double timeout)
+        {
+            params.warmup = warmup;
+            params.iterations = iterations;
+            params.dispatches = dispatches;
+            params.timeout = timeout;
+        }
+
+#ifdef GLFFT_SERIALIZATION
+        // Serialization interface.
+        std::string archive() const;
+        void extract(const char *json);
+#endif
+
+    private:
+        std::unordered_map<WisdomPass, FFTOptions::Performance> library;
+
+        std::pair<double, FFTOptions::Performance> study(Context *context,
+                const WisdomPass &pass, FFTOptions::Type options) const;
+
+        double bench(Context *cmd, Resource *output, Resource *input,
+                const WisdomPass &pass, const FFTOptions &options,
+                const std::shared_ptr<ProgramCache> &cache) const;
+
+        FFTStaticWisdom static_wisdom;
+
+        struct
+        {
+            unsigned warmup = 2;
+            unsigned iterations = 20;
+            unsigned dispatches = 50;
+            double timeout = 1.0;
+        } params;
+};
+
+}
+
+#endif
+