From b26f11ae20bbdd6be7e6509f38a82cbdcf4a392b Mon Sep 17 00:00:00 2001 From: Jarcode Date: Wed, 11 Sep 2019 23:06:03 -0700 Subject: [PATCH] Add GLFFT --- glfft/LICENSE_ORIGINAL | 19 + glfft/glfft.cpp | 1125 ++++++++++++++++++++++++++++++++ glfft/glfft.hpp | 225 +++++++ glfft/glfft_common.hpp | 178 +++++ glfft/glfft_gl_api_headers.hpp | 6 + glfft/glfft_gl_interface.cpp | 310 +++++++++ glfft/glfft_gl_interface.hpp | 258 ++++++++ glfft/glfft_interface.hpp | 131 ++++ glfft/glfft_wisdom.cpp | 600 +++++++++++++++++ glfft/glfft_wisdom.hpp | 149 +++++ 10 files changed, 3001 insertions(+) create mode 100644 glfft/LICENSE_ORIGINAL create mode 100644 glfft/glfft.cpp create mode 100644 glfft/glfft.hpp create mode 100644 glfft/glfft_common.hpp create mode 100644 glfft/glfft_gl_api_headers.hpp create mode 100644 glfft/glfft_gl_interface.cpp create mode 100644 glfft/glfft_gl_interface.hpp create mode 100644 glfft/glfft_interface.hpp create mode 100644 glfft/glfft_wisdom.cpp create mode 100644 glfft/glfft_wisdom.hpp diff --git a/glfft/LICENSE_ORIGINAL b/glfft/LICENSE_ORIGINAL new file mode 100644 index 0000000..1d7f765 --- /dev/null +++ b/glfft/LICENSE_ORIGINAL @@ -0,0 +1,19 @@ +Copyright (c) 2015 Hans-Kristian Arntzen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/glfft/glfft.cpp b/glfft/glfft.cpp new file mode 100644 index 0000000..a4b53b5 --- /dev/null +++ b/glfft/glfft.cpp @@ -0,0 +1,1125 @@ +/* Copyright (C) 2015 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "glfft.hpp" +#include +#include +#include +#include +#include +#include +#include + +#ifdef GLFFT_CLI_ASYNC +#include "glfft_cli.hpp" +#endif + +/* GLava addition: should be defined by meson */ +#if defined(GLAVA_STANDALONE) || !defined(SHADER_INSTALL_PATH) +#undef SHADER_INSTALL_PATH +#define SHADER_INSTALL_PATH "../shaders/glava" +#endif + +using namespace std; +using namespace GLFFT; + +enum Bindings +{ + BindingSSBOIn = 0, + BindingSSBOOut = 1, + BindingSSBOAux = 2, + BindingUBO = 3, + BindingTexture0 = 4, + BindingTexture1 = 5, + BindingImage = 6 +}; + +struct WorkGroupSize +{ + unsigned x, y, z; +}; + +struct Radix +{ + WorkGroupSize size; + unsigned num_workgroups_x; + unsigned num_workgroups_y; + unsigned radix; + unsigned vector_size; + bool shared_banked; +}; + +static unsigned next_pow2(unsigned v) +{ + v--; + v |= v >> 16; + v |= v >> 8; + v |= v >> 4; + v |= v >> 2; + v |= v >> 1; + return v + 1; +} + +static void reduce(unsigned &wg_size, unsigned &divisor) +{ + if (divisor > 1 && wg_size >= divisor) + { + wg_size /= divisor; + divisor = 1; + } + else if (divisor > 1 && wg_size < divisor) + { + divisor /= wg_size; + wg_size = 1; + } +} + +static unsigned radix_to_wg_z(unsigned radix) +{ + switch (radix) + { + case 16: + return 4; + + case 64: + return 8; + + default: + return 1; + } +} + +static Radix build_radix(unsigned Nx, unsigned Ny, + Mode mode, unsigned vector_size, bool shared_banked, unsigned radix, + WorkGroupSize size, + bool pow2_stride) +{ + unsigned wg_x = 0, wg_y = 0; + + if (Ny == 1 && size.y > 1) + { + throw logic_error("WorkGroupSize.y must be 1, when Ny == 1.\n"); + } + + // To avoid too many threads per workgroup due to workgroup_size_z, + // try to divide workgroup_size_y, then workgroup_size_x. + // TODO: Make a better constraint solver which takes into account cache line sizes, + // and image swizzling patterns, etc ... Not that critical though, since wisdom interface + // will find the optimal options despite this. + unsigned divisor = size.z; + reduce(size.y, divisor); + reduce(size.x, divisor); + + switch (mode) + { + case Vertical: + // If we have pow2_stride, we need to transform 2^n + 1 elements horizontally, + // so just add a single workgroup in X. + // We pad by going up to pow2 stride anyways. + // We will transform some garbage, + // but it's better than transforming close to double the amount. + wg_x = (2 * Nx) / (vector_size * size.x) + pow2_stride; + wg_y = Ny / (size.y * radix); + break; + + case VerticalDual: + vector_size = max(vector_size, 4u); + wg_x = (4 * Nx) / (vector_size * size.x); + wg_y = Ny / (size.y * radix); + break; + + case Horizontal: + wg_x = (2 * Nx) / (vector_size * radix * size.x); + wg_y = Ny / size.y; + break; + + case HorizontalDual: + vector_size = max(vector_size, 4u); + wg_x = (4 * Nx) / (vector_size * radix * size.x); + wg_y = Ny / size.y; + break; + + default: + assert(0); + } + + return { size, wg_x, wg_y, radix, vector_size, shared_banked }; +} + +// Resolve radices are simpler, and don't yet support different vector sizes, etc. +static Radix build_resolve_radix(unsigned Nx, unsigned Ny, WorkGroupSize size) +{ + return { size, Nx / size.x, Ny / size.y, 2, 2, false }; +} + +// Smaller FFT with larger workgroups are not always possible to create. +static bool is_radix_valid(unsigned Nx, unsigned Ny, + Mode mode, unsigned vector_size, unsigned radix, + WorkGroupSize size, + bool pow2_stride) +{ + auto res = build_radix(Nx, Ny, + mode, vector_size, false, radix, + size, + pow2_stride); + + return res.num_workgroups_x > 0 && res.num_workgroups_y > 0; +} + +static double find_cost(unsigned Nx, unsigned Ny, Mode mode, unsigned radix, + const FFTOptions &options, const FFTWisdom &wisdom) +{ + auto opt = wisdom.find_optimal_options(Nx, Ny, radix, mode, SSBO, SSBO, options.type); + + // Return a very rough estimate if we cannot find cost. + // The cost functions generated here are expected to be huge, + // always much larger than true cost functions. + // The purpose of this is to give a strong bias towards radices we have wisdom for. + // We also give a bias towards larger radices, since they are generally more BW efficient. + return opt ? opt->first.cost : Nx * Ny * (log2(float(radix)) + 2.0f); +} + +struct CostPropagate +{ + CostPropagate() = default; + CostPropagate(double cost, vector radices) + : cost(cost), radices(move(radices)) {} + + void merge_if_better(const CostPropagate &a, const CostPropagate &b) + { + double new_cost = a.cost + b.cost; + + if ((cost == 0.0 || new_cost < cost) && a.cost != 0.0 && b.cost != 0.0) + { + cost = new_cost; + radices = a.radices; + radices.insert(end(radices), begin(b.radices), end(b.radices)); + } + } + + double cost = 0.0; + vector radices; +}; + +static vector split_radices(unsigned Nx, unsigned Ny, Mode mode, Target input_target, Target output_target, + const FFTOptions &options, + bool pow2_stride, const FFTWisdom &wisdom, double &accumulate_cost) +{ + unsigned N; + switch (mode) + { + case Vertical: + case VerticalDual: + N = Ny; + break; + + case Horizontal: + case HorizontalDual: + N = Nx; + break; + + default: + return {}; + } + + // N == 1 is for things like Nx1 transforms where we don't do any vertical transforms. + if (N == 1) + { + return {}; + } + + // Treat cost 0.0 as invalid. + double cost_table[8] = {0.0}; + CostPropagate cost_propagate[32]; + + // Fill table with fastest known ways to do radix 4, radix 8, radix 16, and 64. + // We'll then find the optimal subdivision which has the lowest additive cost. + cost_table[2] = find_cost(Nx, Ny, mode, 4, options, wisdom); + cost_table[3] = find_cost(Nx, Ny, mode, 8, options, wisdom); + cost_table[4] = find_cost(Nx, Ny, mode, 16, options, wisdom); + cost_table[6] = find_cost(Nx, Ny, mode, 64, options, wisdom); + + auto is_valid = [&](unsigned radix) -> bool { + unsigned workgroup_size_z = radix_to_wg_z(radix); + auto &opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options); + + // We don't want pow2_stride to round up a very inefficient work group and make the is_valid test pass. + return is_radix_valid(Nx, Ny, + mode, opt.vector_size, radix, + { opt.workgroup_size_x, opt.workgroup_size_y, workgroup_size_z }, + false); + }; + + // If our work-space is too small to allow certain radices, we disable them from consideration here. + for (unsigned i = 2; i <= 6; i++) + { + // Don't check the composite radix. + if (i == 5) + { + continue; + } + + if (is_valid(1 << i)) + { + cost_propagate[i] = CostPropagate(cost_table[i], { 1u << i }); + } + } + + // Now start bubble this up all the way to N, starting from radix 16. + for (unsigned i = 4; (1u << i) <= N; i++) + { + auto &target = cost_propagate[i]; + + for (unsigned r = 2; i - r >= r; r++) + { + target.merge_if_better(cost_propagate[r], cost_propagate[i - r]); + } + + if ((1u << i) == N && target.cost == 0.0) + { + throw logic_error("There is no possible subdivision ...\n"); + } + } + + // Ensure that the radix splits are sensible. + // A radix-N non p-1 transform mandates that p factor is at least N. + // Sort the splits so that larger radices come first. + // For composite radices like 16 and 64, they are built with 4x4 and 8x8, so we only + // need p factors for 4 and 8 for those cases. + // The cost function doesn't depend in which order we split the radices. + auto &cost = cost_propagate[unsigned(log2(float(N)))]; + auto radices = move(cost.radices); + + sort(begin(radices), end(radices), greater()); + + if (accumulate(begin(radices), end(radices), 1u, multiplies()) != N) + { + throw logic_error("Radix splits are invalid."); + } + + vector radices_out; + radices_out.reserve(radices.size()); + + // Fill in the structs with all information. + for (auto radix : radices) + { + bool first = radices_out.empty(); + bool last = radices_out.size() + 1 == radices.size(); + + // Use known performance options as a fallback. + // We used SSBO -> SSBO cost functions to find the optimal radix splits, + // but replace first and last options with Image -> SSBO / SSBO -> Image cost functions if appropriate. + auto &orig_opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options); + auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, + first ? input_target : SSBO, + last ? output_target : SSBO, + { orig_opt, options.type }); + + radices_out.push_back(build_radix(Nx, Ny, + mode, opts.vector_size, opts.shared_banked, radix, + { opts.workgroup_size_x, opts.workgroup_size_y, radix_to_wg_z(radix) }, + pow2_stride)); + } + + accumulate_cost += cost.cost; + return radices_out; +} + +Program* ProgramCache::find_program(const Parameters ¶meters) const +{ + auto itr = programs.find(parameters); + if (itr != end(programs)) + { + return itr->second.get(); + } + else + { + return nullptr; + } +} + +void ProgramCache::insert_program(const Parameters ¶meters, std::unique_ptr program) +{ + programs[parameters] = move(program); +} + +Program* FFT::get_program(const Parameters ¶ms) +{ + Program *prog = cache->find_program(params); + if (!prog) + { + auto newprog = build_program(params); + if (!newprog) + { + throw runtime_error("Failed to compile shader.\n"); + } + prog = newprog.get(); + cache->insert_program(params, move(newprog)); + } + return prog; +} + +static inline unsigned mode_to_input_components(Mode mode) +{ + switch (mode) + { + case HorizontalDual: + case VerticalDual: + return 4; + + case Horizontal: + case Vertical: + case ResolveComplexToReal: + return 2; + + case ResolveRealToComplex: + return 1; + + default: + return 0; + } +} + +FFT::FFT(Context *context, unsigned Nx, unsigned Ny, + unsigned radix, unsigned p, + Mode mode, Target input_target, Target output_target, + std::shared_ptr program_cache, const FFTOptions &options) + : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny) +{ + set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny); + + if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1))) + { + throw logic_error("FFT size is not POT."); + } + + if (p != 1 && input_target != SSBO) + { + throw logic_error("P != 1 only supported with SSBO as input."); + } + + if (p < radix && output_target != SSBO) + { + throw logic_error("P < radix only supported with SSBO as output."); + } + + // We don't really care about transform direction since it's just a matter of sign-flipping twiddles, + // but we have to obey some fundamental assumptions of resolve passes. + Direction direction = mode == ResolveComplexToReal ? Inverse : Forward; + + Radix res; + if (mode == ResolveRealToComplex || mode == ResolveComplexToReal) + { + res = build_resolve_radix(Nx, Ny, { options.performance.workgroup_size_x, options.performance.workgroup_size_y, 1 }); + } + else + { + res = build_radix(Nx, Ny, + mode, options.performance.vector_size, options.performance.shared_banked, radix, + { options.performance.workgroup_size_x, options.performance.workgroup_size_y, radix_to_wg_z(radix) }, + false); + } + + const Parameters params = { + res.size.x, + res.size.y, + res.size.z, + res.radix, + res.vector_size, + direction, + mode, + input_target, + output_target, + p == 1, + res.shared_banked, + options.type.fp16, options.type.input_fp16, options.type.output_fp16, + options.type.normalize, + }; + + if (res.num_workgroups_x == 0 || res.num_workgroups_y == 0) + { + throw logic_error("Invalid workgroup sizes for this radix."); + } + + unsigned uv_scale_x = res.vector_size / mode_to_input_components(mode); + const Pass pass = { + params, + res.num_workgroups_x, res.num_workgroups_y, + uv_scale_x, + next_pow2(res.num_workgroups_x * params.workgroup_size_x), + get_program(params), + }; + + passes.push_back(pass); +} + +static inline void print_radix_splits(Context *context, const vector radices[2]) +{ + context->log("Transform #1\n"); + for (auto &radix : radices[0]) + { + context->log(" Size: (%u, %u, %u)\n", + radix.size.x, radix.size.y, radix.size.z); + context->log(" Dispatch: (%u, %u)\n", + radix.num_workgroups_x, radix.num_workgroups_y); + context->log(" Radix: %u\n", + radix.radix); + context->log(" VectorSize: %u\n\n", + radix.vector_size); + } + + context->log("Transform #2\n"); + for (auto &radix : radices[1]) + { + context->log(" Size: (%u, %u, %u)\n", + radix.size.x, radix.size.y, radix.size.z); + context->log(" Dispatch: (%u, %u)\n", + radix.num_workgroups_x, radix.num_workgroups_y); + context->log(" Radix: %u\n", + radix.radix); + context->log(" VectorSize: %u\n\n", + radix.vector_size); + } +} + +static inline unsigned type_to_input_components(Type type) +{ + switch (type) + { + case ComplexToComplex: + case ComplexToReal: + return 2; + + case RealToComplex: + return 1; + + case ComplexToComplexDual: + return 4; + + default: + return 0; + } +} + +FFT::FFT(Context *context, unsigned Nx, unsigned Ny, + Type type, Direction direction, Target input_target, Target output_target, + std::shared_ptr program_cache, const FFTOptions &options, const FFTWisdom &wisdom) + : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny) +{ + set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny); + + size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2); + temp_buffer_size >>= options.type.output_fp16; + + temp_buffer = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy); + if (output_target != SSBO) + { + temp_buffer_image = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy); + } + + bool expand = false; + if (type == ComplexToReal || type == RealToComplex) + { + // If we're doing C2R or R2C, we'll need double the scratch memory, + // so make sure we're dividing Nx *after* allocating. + Nx /= 2; + expand = true; + } + + // Sanity checks. + if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1))) + { + throw logic_error("FFT size is not POT."); + } + + if (type == ComplexToReal && direction == Forward) + { + throw logic_error("ComplexToReal transforms requires inverse transform."); + } + + if (type == RealToComplex && direction != Forward) + { + throw logic_error("RealToComplex transforms requires forward transform."); + } + + if (type == RealToComplex && input_target == Image) + { + throw logic_error("Input real-to-complex must use ImageReal target."); + } + + if (type == ComplexToReal && output_target == Image) + { + throw logic_error("Output complex-to-real must use ImageReal target."); + } + + vector radices[2]; + Mode modes[2]; + Target targets[4]; + + switch (direction) + { + case Forward: + modes[0] = type == ComplexToComplexDual ? HorizontalDual : Horizontal; + modes[1] = type == ComplexToComplexDual ? VerticalDual : Vertical; + + targets[0] = input_target; + targets[1] = Ny > 1 ? SSBO : output_target; + targets[2] = targets[1]; + targets[3] = output_target; + + radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, false, wisdom, cost); + radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, expand, wisdom, cost); + break; + + case Inverse: + case InverseConvolve: + modes[0] = type == ComplexToComplexDual ? VerticalDual : Vertical; + modes[1] = type == ComplexToComplexDual ? HorizontalDual : Horizontal; + + targets[0] = input_target; + targets[1] = Ny > 1 ? SSBO : input_target; + targets[2] = targets[1]; + targets[3] = output_target; + + radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, expand, wisdom, cost); + radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, false, wisdom, cost); + break; + } + +#if 0 + print_radix_splits(context, radices); +#endif + + passes.reserve(radices[0].size() + radices[1].size() + expand); + + unsigned index = 0; + unsigned last_index = (radices[1].empty() && !expand) ? 0 : 1; + + for (auto &radix_direction : radices) + { + unsigned p = 1; + unsigned i = 0; + + for (auto &radix : radix_direction) + { + // If this is the last pass and we're writing to an image, use a special shader variant. + bool last_pass = index == last_index && i == radix_direction.size() - 1; + + bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16; + Target out_target = last_pass ? output_target : SSBO; + Target in_target = passes.empty() ? input_target : SSBO; + Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction; + unsigned uv_scale_x = radix.vector_size / type_to_input_components(type); + + const Parameters params = { + radix.size.x, + radix.size.y, + radix.size.z, + radix.radix, + radix.vector_size, + dir, + modes[index], + in_target, + out_target, + p == 1, + radix.shared_banked, + options.type.fp16, input_fp16, options.type.output_fp16, + options.type.normalize, + }; + + const Pass pass = { + params, + radix.num_workgroups_x, radix.num_workgroups_y, + uv_scale_x, + next_pow2(radix.num_workgroups_x * params.workgroup_size_x), + get_program(params), + }; + + passes.push_back(pass); + + p *= radix.radix; + i++; + } + + // After the first transform direction, inject either a real-to-complex resolve or complex-to-real resolve. + // This way, we avoid having special purpose transforms for all FFT variants. + if (index == 0 && (type == ComplexToReal || type == RealToComplex)) + { + bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16; + bool last_pass = radices[1].empty(); + Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction; + Target in_target = passes.empty() ? input_target : SSBO; + Target out_target = last_pass ? output_target : SSBO; + Mode mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex; + unsigned uv_scale_x = 1; + + auto base_opts = options; + base_opts.type.input_fp16 = input_fp16; + + auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts); + auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 }); + + const Parameters params = { + res.size.x, + res.size.y, + res.size.z, + res.radix, + res.vector_size, + dir, + mode, + in_target, + out_target, + true, + false, + base_opts.type.fp16, base_opts.type.input_fp16, base_opts.type.output_fp16, + base_opts.type.normalize, + }; + + const Pass pass = { + params, + Nx / res.size.x, + Ny / res.size.y, + uv_scale_x, + next_pow2(Nx), + get_program(params), + }; + + passes.push_back(pass); + } + + index++; + } +} + +string FFT::load_shader_string(const char *path) +{ + ifstream file(path); + if (!file.good()) + { + throw runtime_error("Failed to load shader file from disk.\n"); + } + stringstream buf; + buf << file.rdbuf(); + return buf.str(); +} + +void FFT::store_shader_string(const char *path, const string &source) +{ + ofstream file(path); + file.write(source.data(), source.size()); +} + +unique_ptr FFT::build_program(const Parameters ¶ms) +{ + string str; + str.reserve(16 * 1024); + +#if 0 + context->log("Building program:\n"); + context->log( + " WG_X: %u\n" + " WG_Y: %u\n" + " WG_Z: %u\n" + " P1: %u\n" + " Radix: %u\n" + " Dir: %d\n" + " Mode: %u\n" + " InTarget: %u\n" + " OutTarget: %u\n" + " FP16: %u\n" + " InFP16: %u\n" + " OutFP16: %u\n" + " Norm: %u\n", + params.workgroup_size_x, + params.workgroup_size_y, + params.workgroup_size_z, + params.p1, + params.radix, + params.direction, + params.mode, + params.input_target, + params.output_target, + params.fft_fp16, + params.input_fp16, + params.output_fp16, + params.fft_normalize); +#endif + + if (params.p1) + { + str += "#define FFT_P1\n"; + } + + if (params.fft_fp16) + { + str += "#define FFT_FP16\n"; + } + + if (params.input_fp16) + { + str += "#define FFT_INPUT_FP16\n"; + } + + if (params.output_fp16) + { + str += "#define FFT_OUTPUT_FP16\n"; + } + + if (params.fft_normalize) + { + str += "#define FFT_NORMALIZE\n"; + } + + if (params.direction == InverseConvolve) + { + str += "#define FFT_CONVOLVE\n"; + } + + str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n"; + + str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n"; + str += string("#define FFT_RADIX ") + to_string(params.radix) + "\n"; + + unsigned vector_size = params.vector_size; + switch (params.mode) + { + case VerticalDual: + str += "#define FFT_DUAL\n"; + str += "#define FFT_VERT\n"; + break; + + case Vertical: + str += "#define FFT_VERT\n"; + break; + + case HorizontalDual: + str += "#define FFT_DUAL\n"; + str += "#define FFT_HORIZ\n"; + break; + + case Horizontal: + str += "#define FFT_HORIZ\n"; + break; + + case ResolveRealToComplex: + str += "#define FFT_RESOLVE_REAL_TO_COMPLEX\n"; + str += "#define FFT_HORIZ\n"; + vector_size = 2; + break; + + case ResolveComplexToReal: + str += "#define FFT_RESOLVE_COMPLEX_TO_REAL\n"; + str += "#define FFT_HORIZ\n"; + vector_size = 2; + break; + } + + switch (params.input_target) + { + case ImageReal: + str += "#define FFT_INPUT_REAL\n"; + // Fallthrough + case Image: + str += "#define FFT_INPUT_TEXTURE\n"; + break; + + default: + break; + } + + switch (params.output_target) + { + case ImageReal: + str += "#define FFT_OUTPUT_REAL\n"; + // Fallthrough + case Image: + str += "#define FFT_OUTPUT_IMAGE\n"; + break; + + default: + break; + } + + switch (vector_size) + { + case 2: + str += "#define FFT_VEC2\n"; + break; + + case 4: + str += "#define FFT_VEC4\n"; + break; + + case 8: + str += "#define FFT_VEC8\n"; + break; + } + + str += string("layout(local_size_x = ") + + to_string(params.workgroup_size_x) + + ", local_size_y = " + + to_string(params.workgroup_size_y) + + ", local_size_z = " + + to_string(params.workgroup_size_z) + + ") in;\n"; + + str += load_shader_string("glfft/glsl/fft_common.comp"); + switch (params.radix) + { + case 4: + str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix4.comp"); + break; + + case 8: + str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix8.comp"); + break; + + case 16: + str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix4.comp"); + str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_shared.comp"); + str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix16.comp"); + break; + + case 64: + str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix8.comp"); + str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_shared.comp"); + str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix64.comp"); + break; + } + str += load_shader_string("glfft/glsl/fft_main.comp"); + + auto prog = context->compile_compute_shader(str.c_str()); + if (!prog) + { + puts(str.c_str()); + } + +#if 0 + char shader_path[1024]; + snprintf(shader_path, sizeof(shader_path), "glfft_shader_radix%u_first%u_mode%u_in_target%u_out_target%u.comp.src", + params.radix, params.p1, params.mode, unsigned(params.input_target), unsigned(params.output_target)); + store_shader_string(shader_path, str); +#endif + + return prog; +} + +double FFT::bench(Context *context, Resource *output, Resource *input, + unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, double max_time) +{ + context->wait_idle(); + auto *cmd = context->request_command_buffer(); + for (unsigned i = 0; i < warmup_iterations; i++) + { + process(cmd, output, input); + } + context->submit_command_buffer(cmd); + context->wait_idle(); + + unsigned runs = 0; + double start_time = context->get_time(); + double total_time = 0.0; + + for (unsigned i = 0; i < iterations && (((context->get_time() - start_time) < max_time) || i == 0); i++) + { +#ifdef GLFFT_CLI_ASYNC + check_async_cancel(); +#endif + + auto *cmd = context->request_command_buffer(); + + double iteration_start = context->get_time(); + for (unsigned d = 0; d < dispatches_per_iteration; d++) + { + process(cmd, output, input); + cmd->barrier(); + runs++; + } + + context->submit_command_buffer(cmd); + context->wait_idle(); + + double iteration_end = context->get_time(); + total_time += iteration_end - iteration_start; + } + + return total_time / runs; +} + +void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resource *input_aux) +{ + if (passes.empty()) + { + return; + } + + Resource *buffers[2] = { + input, + passes.size() & 1 ? + (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) : + temp_buffer.get(), + }; + + if (input_aux != 0) + { + if (passes.front().parameters.input_target != SSBO) + { + cmd->bind_texture(BindingTexture1, static_cast(input_aux)); + cmd->bind_sampler(BindingTexture1, texture.samplers[1]); + } + else + { + if (ssbo.input_aux.size != 0) + { + cmd->bind_storage_buffer_range(BindingSSBOAux, + ssbo.input_aux.offset, ssbo.input_aux.size, static_cast(input_aux)); + } + else + { + cmd->bind_storage_buffer(BindingSSBOAux, static_cast(input_aux)); + } + } + } + + Program *current_program = nullptr; + unsigned p = 1; + unsigned pass_index = 0; + + struct FFTConstantData + { + uint32_t p; + uint32_t stride; + uint32_t padding[2]; + float offset_x, offset_y; + float scale_x, scale_y; + }; + + for (auto &pass : passes) + { + if (pass.program != current_program) + { + cmd->bind_program(pass.program); + current_program = pass.program; + } + + if (pass.parameters.p1) + { + p = 1; + } + + FFTConstantData constant_data; + constant_data.p = p; + constant_data.stride = pass.stride; + p *= pass.parameters.radix; + + if (pass.parameters.input_target != SSBO) + { + cmd->bind_texture(BindingTexture0, static_cast(buffers[0])); + cmd->bind_sampler(BindingTexture0, texture.samplers[0]); + + // If one compute thread reads multiple texels in X dimension, scale this accordingly. + float scale_x = texture.scale_x * pass.uv_scale_x; + + constant_data.offset_x = texture.offset_x; + constant_data.offset_y = texture.offset_y; + constant_data.scale_x = scale_x; + constant_data.scale_y = texture.scale_y; + } + else + { + if (buffers[0] == input && ssbo.input.size != 0) + { + cmd->bind_storage_buffer_range(BindingSSBOIn, + ssbo.input.offset, ssbo.input.size, static_cast(buffers[0])); + } + else if (buffers[0] == output && ssbo.output.size != 0) + { + cmd->bind_storage_buffer_range(BindingSSBOIn, + ssbo.output.offset, ssbo.output.size, static_cast(buffers[0])); + } + else + { + cmd->bind_storage_buffer(BindingSSBOIn, static_cast(buffers[0])); + } + } + + if (pass.parameters.output_target != SSBO) + { + Format format = FormatUnknown; + + // TODO: Make this more flexible, would require shader variants per-format though. + if (pass.parameters.output_target == ImageReal) + { + format = FormatR32Float; + } + else + { + switch (pass.parameters.mode) + { + case VerticalDual: + case HorizontalDual: + format = FormatR16G16B16A16Float; + break; + + case Vertical: + case Horizontal: + case ResolveRealToComplex: + format = FormatR32Uint; + break; + + default: + break; + } + } + cmd->bind_storage_texture(BindingImage, static_cast(output), format); + } + else + { + if (buffers[1] == output && ssbo.output.size != 0) + { + cmd->bind_storage_buffer_range(BindingSSBOOut, + ssbo.output.offset, ssbo.output.size, static_cast(buffers[1])); + } + else + { + cmd->bind_storage_buffer(BindingSSBOOut, static_cast(buffers[1])); + } + } + + cmd->push_constant_data(BindingUBO, &constant_data, sizeof(constant_data)); + cmd->dispatch(pass.workgroups_x, pass.workgroups_y, 1); + + // For last pass, we don't know how our resource will be used afterwards, + // so let barrier decisions be up to the API user. + if (pass_index + 1 < passes.size()) + { + cmd->barrier(static_cast(buffers[1])); + } + + if (pass_index == 0) + { + buffers[0] = passes.size() & 1 ? + temp_buffer.get() : + (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output); + } + + swap(buffers[0], buffers[1]); + pass_index++; + } +} + diff --git a/glfft/glfft.hpp b/glfft/glfft.hpp new file mode 100644 index 0000000..c3b1a69 --- /dev/null +++ b/glfft/glfft.hpp @@ -0,0 +1,225 @@ +/* Copyright (C) 2015 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef GLFFT_HPP__ +#define GLFFT_HPP__ + +#include "glfft_interface.hpp" +#include "glfft_common.hpp" +#include "glfft_wisdom.hpp" +#include +#include +#include + +/// GLFFT doesn't try to preserve GL state in any way. +/// E.g. SHADER_STORAGE_BUFFER bindings, programs bound, texture bindings, etc. +/// Applications calling this library must expect that some GL state will be modified. +/// No rendering state associated with graphics will be modified. + +namespace GLFFT +{ + +class FFT +{ + public: + /// @brief Creates a full FFT. + /// + /// All buffer allocation done by GLFFT will be done in constructor. + /// Will throw if invalid parameters are passed. + /// + /// @param context The graphics context. + /// @param Nx Number of samples in horizontal dimension. + /// @param Ny Number of samples in vertical dimension. + /// @param type The transform type. + /// @param direction Forward, inverse or inverse with convolution. + /// For real-to-complex and complex-to-real transforms, the + /// transform type must match. + /// @param input_target GL object type of input target. For real-to-complex with texture as input, ImageReal is used. + /// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used. + /// @param cache A program cache for caching the GLFFT programs created. + /// @param options FFT options such as performance related parameters and types. + /// @param wisdom GLFFT wisdom which can override performance related options + /// (options.performance is used as a fallback). + FFT(Context *context, unsigned Nx, unsigned Ny, + Type type, Direction direction, Target input_target, Target output_target, + std::shared_ptr cache, const FFTOptions &options, + const FFTWisdom &wisdom = FFTWisdom()); + + /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs. + /// + /// All buffer allocation done by GLFFT will be done in constructor. + /// Will throw if invalid parameters are passed. + /// + /// @param context The graphics context. + /// @param Nx Number of samples in horizontal dimension. + /// @param Ny Number of samples in vertical dimension. + /// @param radix FFT radix to test. + /// @param p Accumulated p factor. If 1, "first pass" mode is tested, otherwise, generic FFT stages. + /// @param mode The transform mode. + /// @param input_target GL object type of input target. For real-to-complex with texture as input, ImageReal is used. + /// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used. + /// @param cache A program cache for caching the GLFFT programs created. + /// @param options FFT options such as performance related parameters and types. + FFT(Context *context, unsigned Nx, unsigned Ny, unsigned radix, unsigned p, + Mode mode, Target input_target, Target output_target, + std::shared_ptr cache, const FFTOptions &options); + + /// @brief Process the FFT. + /// + /// The type of object passed here must match what FFT was initialized with. + /// + /// @param cmd Command buffer for issuing dispatch commands. + /// @param output Output buffer or image. + /// NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D! + /// @param input Input buffer or texture. + /// @param input_aux If using convolution transform type, + /// the content of input and input_aux will be multiplied together. + void process(CommandBuffer *cmd, Resource *output, Resource *input, Resource *input_aux = nullptr); + + /// @brief Run process() multiple times, timing the results. + /// + /// Mostly used internally by GLFFT wisdom, glfft_cli's bench, and so on. + /// + /// @param context The graphics context. + /// @param output Output buffer or image. + /// NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D! + /// @param input Input buffer or texture. + /// @param warmup_iterations Number of iterations to run to "warm" up GL, ensures we don't hit + /// recompilations or similar when benching. + /// @param iterations Number of iterations to run the benchmark. + /// Each iteration will ensure timing with a glFinish() followed by timing. + /// @param dispatches_per_iteration Number of calls to process() we should do per iteration. + /// @param max_time The max time the benchmark should run. Will be checked after each iteration is complete. + /// + /// @returns Average GPU time per process() call. + double bench(Context *context, Resource *output, Resource *input, + unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, + double max_time = std::numeric_limits::max()); + + /// @brief Returns cost for a process() call. Only used for debugging. + double get_cost() const { return cost; } + + /// @brief Returns number of passes (glDispatchCompute) in a process() call. + unsigned get_num_passes() const { return passes.size(); } + + /// @brief Returns Nx. + unsigned get_dimension_x() const { return size_x; } + /// @brief Returns Ny. + unsigned get_dimension_y() const { return size_y; } + + /// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures. + /// + /// By default, these values are 0.5 / size (samples in the center of texel (0, 0)). + /// Scale is 1.0 / size, so it steps one texel for each coordinate in the FFT transform. + /// Setting this to something custom is useful to get downsampling with GL_LINEAR -> FFT transform + /// without having to downsample the texture first, then FFT. + void set_texture_offset_scale(float offset_x, float offset_y, float scale_x, float scale_y) + { + texture.offset_x = offset_x; + texture.offset_y = offset_y; + texture.scale_x = scale_x; + texture.scale_y = scale_y; + } + + /// @brief Set binding range for input. + /// + /// If input is an SSBO, set a custom binding range to be passed to glBindBufferRange. + /// By default, the entire buffer is bound. + void set_input_buffer_range(size_t offset, size_t size) + { + ssbo.input.offset = offset; + ssbo.input.size = size; + } + + /// @brief Set binding range for input_aux. + /// + /// If input_aux is an SSBO, set a custom binding range to be passed to glBindBufferRange. + /// By default, the entire buffer is bound. + void set_input_aux_buffer_range(size_t offset, size_t size) + { + ssbo.input_aux.offset = offset; + ssbo.input_aux.size = size; + } + + /// @brief Set binding range for output. + /// + /// If output buffer is an SSBO, set a custom binding range to be passed to glBindBufferRange. + /// By default, the entire buffer is bound. + void set_output_buffer_range(size_t offset, size_t size) + { + ssbo.output.offset = offset; + ssbo.output.size = size; + } + + /// @brief Set samplers for input textures. + /// + /// Set sampler objects to be used for input and input_aux if textures are used as input. + /// By default, sampler object 0 will be used (inheriting sampler parameters from the texture object itself). + void set_samplers(Sampler *sampler0, Sampler *sampler1 = nullptr) + { + texture.samplers[0] = sampler0; + texture.samplers[1] = sampler1; + } + + private: + Context *context; + + struct Pass + { + Parameters parameters; + + unsigned workgroups_x; + unsigned workgroups_y; + unsigned uv_scale_x; + unsigned stride; + Program *program; + }; + + double cost = 0.0; + + std::unique_ptr temp_buffer; + std::unique_ptr temp_buffer_image; + std::vector passes; + std::shared_ptr cache; + + std::unique_ptr build_program(const Parameters ¶ms); + static std::string load_shader_string(const char *path); + static void store_shader_string(const char *path, const std::string &source); + + Program* get_program(const Parameters ¶ms); + + struct + { + float offset_x = 0.0f, offset_y = 0.0f, scale_x = 1.0f, scale_y = 1.0f; + Sampler *samplers[2] = { nullptr, nullptr }; + } texture; + + struct + { + struct + { + size_t offset = 0; + size_t size = 0; + } input, input_aux, output; + } ssbo; + unsigned size_x, size_y; +}; + +} + +#endif diff --git a/glfft/glfft_common.hpp b/glfft/glfft_common.hpp new file mode 100644 index 0000000..4065f68 --- /dev/null +++ b/glfft/glfft_common.hpp @@ -0,0 +1,178 @@ +/* Copyright (C) 2015 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +// For the most part used by the implementation. + +#ifndef GLFFT_COMMON_HPP__ +#define GLFFT_COMMON_HPP__ + +#include "glfft_interface.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace GLFFT +{ + +enum Direction +{ + /// Forward FFT transform. + Forward = -1, + /// Inverse FFT transform, but with two inputs (in frequency domain) which are multiplied together + /// for convolution. + InverseConvolve = 0, + /// Inverse FFT transform. + Inverse = 1 +}; + +enum Mode +{ + Horizontal, + HorizontalDual, + Vertical, + VerticalDual, + + ResolveRealToComplex, + ResolveComplexToReal, +}; + +enum Type +{ + /// Regular complex-to-complex transform. + ComplexToComplex, + /// Complex-to-complex dual transform where the complex value is four-dimensional, + /// i.e. a vector of two complex values. Typically used to transform RGBA data. + ComplexToComplexDual, + /// Complex-to-real transform. N / 2 + 1 complex values are used per row with a stride of N complex samples. + ComplexToReal, + /// Real-to-complex transform. N / 2 + 1 complex output samples are created per row with a stride of N complex samples. + RealToComplex +}; + +enum Target +{ + /// GL_SHADER_STORAGE_BUFFER + SSBO, + /// Textures, when used as output, type is determined by transform type. + /// ComplexToComplex / RealToComplex -> GL_RG16F + /// ComplexToComplexDual -> GL_RGBA16F + Image, + /// Real-valued (single component) textures, when used as output, type is determined by transform type. + /// ComplexToReal -> GL_R32F (because GLES 3.1 doesn't have GL_R16F image type). + ImageReal +}; + +struct Parameters +{ + unsigned workgroup_size_x; + unsigned workgroup_size_y; + unsigned workgroup_size_z; + unsigned radix; + unsigned vector_size; + Direction direction; + Mode mode; + Target input_target; + Target output_target; + bool p1; + bool shared_banked; + bool fft_fp16, input_fp16, output_fp16; + bool fft_normalize; + + bool operator==(const Parameters &other) const + { + return std::memcmp(this, &other, sizeof(Parameters)) == 0; + } +}; + +/// @brief Options for FFT implementation. +/// Defaults for performance as conservative. +struct FFTOptions +{ + struct Performance + { + /// Workgroup size used in layout(local_size_x). + /// Only affects performance, however, large values may make implementations of smaller sized FFTs impossible. + /// FFT constructor will throw in this case. + unsigned workgroup_size_x = 4; + /// Workgroup size used in layout(local_size_x). + /// Only affects performance, however, large values may make implementations of smaller sized FFTs impossible. + /// FFT constructor will throw in this case. + unsigned workgroup_size_y = 1; + /// Vector size. Very GPU dependent. "Scalar" GPUs prefer 2 here, vector GPUs prefer 4 (and maybe 8). + unsigned vector_size = 2; + /// Whether to use banked shared memory or not. + /// Desktop GPUs prefer true here, false for mobile in general. + bool shared_banked = false; + } performance; + + struct Type + { + /// Whether internal shader should be mediump float. + bool fp16 = false; + /// Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32. + bool input_fp16 = false; + /// Whether output SSBO is a packed 2xfp16 format. Otherwise, regular FP32. + bool output_fp16 = false; + /// Whether to apply 1 / N normalization factor. + bool normalize = false; + } type; +}; + +} + +namespace std +{ + template<> + struct hash + { + std::size_t operator()(const GLFFT::Parameters ¶ms) const + { + std::size_t h = 0; + hash hasher; + for (std::size_t i = 0; i < sizeof(GLFFT::Parameters); i++) + { + h ^= hasher(reinterpret_cast(¶ms)[i]); + } + + return h; + } + }; +} + +namespace GLFFT +{ + +class ProgramCache +{ + public: + Program* find_program(const Parameters ¶meters) const; + void insert_program(const Parameters ¶meters, std::unique_ptr program); + size_t cache_size() const { return programs.size(); } + + private: + std::unordered_map> programs; +}; + +} + +#endif + diff --git a/glfft/glfft_gl_api_headers.hpp b/glfft/glfft_gl_api_headers.hpp new file mode 100644 index 0000000..d4714e0 --- /dev/null +++ b/glfft/glfft_gl_api_headers.hpp @@ -0,0 +1,6 @@ + +/* Let GLFFT use GLava's headers */ +#define GLFFT_GLSL_LANG_STRING "#version 430 core\n" +extern "C" { + #include "../glava/glad.h" +} diff --git a/glfft/glfft_gl_interface.cpp b/glfft/glfft_gl_interface.cpp new file mode 100644 index 0000000..47e7ef4 --- /dev/null +++ b/glfft/glfft_gl_interface.cpp @@ -0,0 +1,310 @@ +/* Copyright (C) 2015 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "glfft_gl_interface.hpp" +#ifdef GLFFT_GL_DEBUG +#include "glfft_validate.hpp" +#endif +#include +#include +#include + +using namespace GLFFT; +using namespace std; + +GLCommandBuffer GLContext::static_command_buffer; + +void GLCommandBuffer::bind_program(Program *program) +{ + glUseProgram(program ? static_cast(program)->name : 0); +} + +void GLCommandBuffer::bind_storage_texture(unsigned binding, Texture *texture, Format format) +{ + glBindImageTexture(binding, static_cast(texture)->name, + 0, GL_FALSE, 0, GL_WRITE_ONLY, convert(format)); +} + +void GLCommandBuffer::bind_texture(unsigned binding, Texture *texture) +{ + glActiveTexture(GL_TEXTURE0 + binding); + glBindTexture(GL_TEXTURE_2D, static_cast(texture)->name); +} + +void GLCommandBuffer::bind_sampler(unsigned binding, Sampler *sampler) +{ + glBindSampler(binding, sampler ? static_cast(sampler)->name : 0); +} + +void GLCommandBuffer::bind_storage_buffer(unsigned binding, Buffer *buffer) +{ + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, binding, static_cast(buffer)->name); +} + +void GLCommandBuffer::bind_storage_buffer_range(unsigned binding, size_t offset, size_t size, Buffer *buffer) +{ + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, static_cast(buffer)->name, offset, size); +} + +void GLCommandBuffer::dispatch(unsigned x, unsigned y, unsigned z) +{ + glDispatchCompute(x, y, z); +} + +void GLCommandBuffer::barrier(Buffer*) +{ + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); +} + +void GLCommandBuffer::barrier(Texture*) +{ + glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT); +} + +void GLCommandBuffer::barrier() +{ + glMemoryBarrier(GL_ALL_BARRIER_BITS); +} + +void GLCommandBuffer::push_constant_data(unsigned binding, const void *data, size_t size) +{ + glBindBufferBase(GL_UNIFORM_BUFFER, binding, ubos[ubo_index]); + void *ptr = glMapBufferRange(GL_UNIFORM_BUFFER, + 0, CommandBuffer::MaxConstantDataSize, + GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT); + + if (ptr) + { + std::memcpy(ptr, data, size); + glUnmapBuffer(GL_UNIFORM_BUFFER); + } + + if (++ubo_index >= ubo_count) + ubo_index = 0; +} + +CommandBuffer* GLContext::request_command_buffer() +{ + if (!initialized_ubos) + { + glGenBuffers(MaxBuffersRing, ubos); + for (auto &ubo : ubos) + { + glBindBuffer(GL_UNIFORM_BUFFER, ubo); + glBufferData(GL_UNIFORM_BUFFER, CommandBuffer::MaxConstantDataSize, nullptr, GL_STREAM_DRAW); + } + static_command_buffer.set_constant_data_buffers(ubos, MaxBuffersRing); + initialized_ubos = true; + } + return &static_command_buffer; +} + +void GLContext::submit_command_buffer(CommandBuffer*) +{} + +void GLContext::wait_idle() +{ + glFinish(); +} + +unique_ptr GLContext::create_texture(const void *initial_data, + unsigned width, unsigned height, + Format format) +{ + return unique_ptr(new GLTexture(initial_data, width, height, format)); +} + +unique_ptr GLContext::create_buffer(const void *initial_data, size_t size, AccessMode access) +{ + return unique_ptr(new GLBuffer(initial_data, size, access)); +} + +unique_ptr GLContext::compile_compute_shader(const char *source) +{ +#ifdef GLFFT_GL_DEBUG + if (!validate_glsl_source(source)) + return nullptr; +#endif + + GLuint program = glCreateProgram(); + if (!program) + { + return nullptr; + } + + GLuint shader = glCreateShader(GL_COMPUTE_SHADER); + + const char *sources[] = { GLFFT_GLSL_LANG_STRING, source }; + glShaderSource(shader, 2, sources, NULL); + glCompileShader(shader); + + GLint status; + glGetShaderiv(shader, GL_COMPILE_STATUS, &status); + if (status == GL_FALSE) + { + GLint len; + GLsizei out_len; + + glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &len); + vector buf(len); + glGetShaderInfoLog(shader, len, &out_len, buf.data()); + log("GLFFT: Shader log:\n%s\n\n", buf.data()); + + glDeleteShader(shader); + glDeleteProgram(program); + return 0; + } + + glAttachShader(program, shader); + glLinkProgram(program); + glDeleteShader(shader); + + glGetProgramiv(program, GL_LINK_STATUS, &status); + if (status == GL_FALSE) + { + GLint len; + GLsizei out_len; + glGetProgramiv(program, GL_INFO_LOG_LENGTH, &len); + vector buf(len); + glGetProgramInfoLog(program, len, &out_len, buf.data()); + log("Program log:\n%s\n\n", buf.data()); + + glDeleteProgram(program); + glDeleteShader(shader); + return nullptr; + } + + return unique_ptr(new GLProgram(program)); +} + +void GLContext::log(const char *fmt, ...) +{ + char buffer[4 * 1024]; + + va_list va; + va_start(va, fmt); + vsnprintf(buffer, sizeof(buffer), fmt, va); + va_end(va); + glfft_log("%s", buffer); +} + +double GLContext::get_time() +{ + return glfft_time(); +} + +unsigned GLContext::get_max_work_group_threads() +{ + GLint value; + glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &value); + return value; +} + +const char* GLContext::get_renderer_string() +{ + return reinterpret_cast(glGetString(GL_RENDERER)); +} + +const void* GLContext::map(Buffer *buffer, size_t offset, size_t size) +{ + glBindBuffer(GL_SHADER_STORAGE_BUFFER, static_cast(buffer)->name); + const void *ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, offset, size, GL_MAP_READ_BIT); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); + return ptr; +} + +void GLContext::unmap(Buffer *buffer) +{ + glBindBuffer(GL_SHADER_STORAGE_BUFFER, static_cast(buffer)->name); + glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); +} + +void GLContext::teardown() +{ + if (initialized_ubos) + glDeleteBuffers(MaxBuffersRing, ubos); + initialized_ubos = false; +} + +GLContext::~GLContext() +{ + teardown(); +} + +GLTexture::GLTexture(const void *initial_data, + unsigned width, unsigned height, + Format format) +{ + glGenTextures(1, &name); + glBindTexture(GL_TEXTURE_2D, name); + glTexStorage2D(GL_TEXTURE_2D, 1, convert(format), width, height); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + if (initial_data) + { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, + convert_format(format), convert_type(format), initial_data); + } + + glBindTexture(GL_TEXTURE_2D, 0); +} + +GLTexture::~GLTexture() +{ + if (owned) + glDeleteTextures(1, &name); +} + +GLBuffer::GLBuffer(const void *initial_data, size_t size, AccessMode access) +{ + glGenBuffers(1, &name); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, name); + glBufferData(GL_SHADER_STORAGE_BUFFER, size, initial_data, convert(access)); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); +} + +GLBuffer::~GLBuffer() +{ + if (owned) + glDeleteBuffers(1, &name); +} + +GLProgram::GLProgram(GLuint name) + : name(name) +{} + +GLProgram::~GLProgram() +{ + if (name != 0) + { + glDeleteProgram(name); + } +} + +GLSampler::~GLSampler() +{ + if (name != 0) + { + glDeleteSamplers(1, &name); + } +} + diff --git a/glfft/glfft_gl_interface.hpp b/glfft/glfft_gl_interface.hpp new file mode 100644 index 0000000..43929c7 --- /dev/null +++ b/glfft/glfft_gl_interface.hpp @@ -0,0 +1,258 @@ +/* Copyright (C) 2015 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef GLFFT_GL_INTERFACE_HPP__ +#define GLFFT_GL_INTERFACE_HPP__ + +#include "glfft_interface.hpp" + +#include "glfft_gl_api_headers.hpp" + +/* GLava additions (POSIX) */ +extern "C" { + #include + #include + #include + #include + #include +} + +#ifndef GLFFT_GLSL_LANG_STRING +#error GLFFT_GLSL_LANG_STRING must be defined to e.g. "#version 310 es\n" or "#version 430 core\n". +#endif + +#ifndef GLFFT_LOG_OVERRIDE +void glfft_log(const char *fmt, ...) { + va_list l; + va_start(l, fmt); + vfprintf(stdout, fmt, l); + va_end(l); +} +#else +#define glfft_log GLFFT_LOG_OVERRIDE +#endif + +#ifndef GLFFT_TIME_OVERRIDE +double glfft_time() { + struct timespec tv; + if (clock_gettime(CLOCK_REALTIME, &tv)) { + fprintf(stderr, "clock_gettime(CLOCK_REALTIME, ...): %s\n", strerror(errno)); + } + return (double) tv.tv_sec + ((double) tv.tv_nsec / 1000000000.0); +} +#else +#define glfft_time GLFFT_TIME_OVERRIDE +#endif + +namespace GLFFT +{ + class GLContext; + + class GLTexture : public Texture + { + public: + friend class GLContext; + friend class GLCommandBuffer; + ~GLTexture(); + + GLTexture(GLuint obj) : name(obj), owned(false) {} + GLuint get() const { return name; } + + private: + GLTexture(const void *initial_data, + unsigned width, unsigned height, + Format format); + GLuint name; + bool owned = true; + }; + + // Not really used by test and bench code, but can be useful for API users. + class GLSampler : public Sampler + { + public: + friend class GLContext; + friend class GLCommandBuffer; + ~GLSampler(); + + GLSampler(GLuint obj) : name(obj) {} + GLuint get() const { return name; } + + private: + GLuint name; + }; + + class GLBuffer : public Buffer + { + public: + friend class GLContext; + friend class GLCommandBuffer; + ~GLBuffer(); + + GLBuffer(GLuint obj) : name(obj), owned(false) {} + GLuint get() const { return name; } + + private: + GLuint name; + GLBuffer(const void *initial_data, size_t size, AccessMode access); + bool owned = true; + }; + + class GLProgram : public Program + { + public: + friend class GLContext; + friend class GLCommandBuffer; + ~GLProgram(); + + GLuint get() const { return name; } + + private: + GLProgram(GLuint name); + GLuint name; + }; + + class GLCommandBuffer : public CommandBuffer + { + public: + ~GLCommandBuffer() = default; + + void set_constant_data_buffers(const GLuint *ubos, unsigned count) + { + this->ubos = ubos; + ubo_index = 0; + ubo_count = count; + } + + void bind_program(Program *program) override; + void bind_storage_texture(unsigned binding, Texture *texture, Format format) override; + void bind_texture(unsigned binding, Texture *texture) override; + void bind_sampler(unsigned binding, Sampler *sampler) override; + void bind_storage_buffer(unsigned binding, Buffer *texture) override; + void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) override; + void dispatch(unsigned x, unsigned y, unsigned z) override; + + void barrier(Buffer *buffer) override; + void barrier(Texture *buffer) override; + void barrier() override; + + void push_constant_data(unsigned binding, const void *data, size_t size) override; + + private: + const GLuint *ubos = nullptr; + unsigned ubo_count = 0; + unsigned ubo_index = 0; + }; + + class GLContext : public Context + { + public: + ~GLContext(); + + std::unique_ptr create_texture(const void *initial_data, + unsigned width, unsigned height, + Format format) override; + + std::unique_ptr create_buffer(const void *initial_data, size_t size, AccessMode access) override; + std::unique_ptr compile_compute_shader(const char *source) override; + + CommandBuffer* request_command_buffer() override; + void submit_command_buffer(CommandBuffer *cmd) override; + void wait_idle() override; + + const char* get_renderer_string() override; + void log(const char *fmt, ...) override; + double get_time() override; + + unsigned get_max_work_group_threads() override; + + const void* map(Buffer *buffer, size_t offset, size_t size) override; + void unmap(Buffer *buffer) override; + + // Not supported in GLES, so override when creating platform-specific context. + bool supports_texture_readback() override { return false; } + void read_texture(void*, Texture*, Format) override {} + + protected: + void teardown(); + + private: + static GLCommandBuffer static_command_buffer; + + enum { MaxBuffersRing = 256 }; + GLuint ubos[MaxBuffersRing]; + bool initialized_ubos = false; + }; + + static inline GLenum convert(AccessMode mode) + { + switch (mode) + { + case AccessStreamCopy: return GL_STREAM_COPY; + case AccessStaticCopy: return GL_STATIC_COPY; + case AccessStreamRead: return GL_STREAM_READ; + } + return 0; + } + + static inline GLenum convert(Format format) + { + switch (format) + { + case FormatR16G16B16A16Float: return GL_RGBA16F; + case FormatR32G32B32A32Float: return GL_RGBA32F; + case FormatR32Float: return GL_R32F; + case FormatR16G16Float: return GL_RG16F; + case FormatR32G32Float: return GL_RG32F; + case FormatR32Uint: return GL_R32UI; + case FormatUnknown: return 0; + } + return 0; + } + + static inline GLenum convert_format(Format format) + { + switch (format) + { + case FormatR16G16Float: return GL_RG; + case FormatR32G32Float: return GL_RG; + case FormatR16G16B16A16Float: return GL_RGBA; + case FormatR32G32B32A32Float: return GL_RGBA; + case FormatR32Float: return GL_RED; + case FormatR32Uint: return GL_RED_INTEGER; + case FormatUnknown: return 0; + } + return 0; + } + + static inline GLenum convert_type(Format format) + { + switch (format) + { + case FormatR16G16Float: return GL_HALF_FLOAT; + case FormatR16G16B16A16Float: return GL_HALF_FLOAT; + case FormatR32Float: return GL_FLOAT; + case FormatR32G32Float: return GL_FLOAT; + case FormatR32G32B32A32Float: return GL_FLOAT; + case FormatR32Uint: return GL_UNSIGNED_INT; + case FormatUnknown: return 0; + } + return 0; + } +} + +#endif diff --git a/glfft/glfft_interface.hpp b/glfft/glfft_interface.hpp new file mode 100644 index 0000000..c2cc909 --- /dev/null +++ b/glfft/glfft_interface.hpp @@ -0,0 +1,131 @@ +/* Copyright (C) 2015 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef GLFFT_INTERFACE_HPP__ +#define GLFFT_INTERFACE_HPP__ + +#include + +namespace GLFFT +{ + class Context; + + class Resource + { + public: + virtual ~Resource() = default; + + // Non-movable, non-copyable to make things simpler. + Resource(Resource&&) = delete; + void operator=(const Resource&) = delete; + + protected: + Resource() = default; + }; + + class Texture : public Resource {}; + class Sampler : public Resource {}; + class Buffer : public Resource {}; + + class Program + { + public: + virtual ~Program() = default; + protected: + friend class Context; + Program() = default; + }; + + enum AccessMode + { + AccessStreamCopy, + AccessStaticCopy, + AccessStreamRead + }; + + enum Format + { + FormatUnknown, + FormatR16G16B16A16Float, + FormatR32G32B32A32Float, + FormatR32G32Float, + FormatR32Float, + FormatR16G16Float, + FormatR32Uint + }; + + class CommandBuffer; + + class Context + { + public: + virtual ~Context() = default; + + virtual std::unique_ptr create_texture(const void *initial_data, + unsigned width, unsigned height, + Format format) = 0; + + virtual std::unique_ptr create_buffer(const void *initial_data, size_t size, AccessMode access) = 0; + virtual std::unique_ptr compile_compute_shader(const char *source) = 0; + + virtual CommandBuffer* request_command_buffer() = 0; + virtual void submit_command_buffer(CommandBuffer *cmd) = 0; + virtual void wait_idle() = 0; + + virtual const char* get_renderer_string() = 0; + virtual void log(const char *fmt, ...) = 0; + virtual double get_time() = 0; + + virtual unsigned get_max_work_group_threads() = 0; + + virtual const void* map(Buffer *buffer, size_t offset, size_t size) = 0; + virtual void unmap(Buffer *buffer) = 0; + + virtual bool supports_texture_readback() = 0; + virtual void read_texture(void *buffer, Texture *texture, Format format) = 0; + + protected: + Context() = default; + }; + + class CommandBuffer + { + public: + virtual ~CommandBuffer() = default; + + virtual void bind_program(Program *program) = 0; + virtual void bind_storage_texture(unsigned binding, Texture *texture, Format format) = 0; + virtual void bind_texture(unsigned binding, Texture *texture) = 0; + virtual void bind_sampler(unsigned binding, Sampler *sampler) = 0; + virtual void bind_storage_buffer(unsigned binding, Buffer *texture) = 0; + virtual void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) = 0; + virtual void dispatch(unsigned x, unsigned y, unsigned z) = 0; + + virtual void barrier(Buffer *buffer) = 0; + virtual void barrier(Texture *buffer) = 0; + virtual void barrier() = 0; + + enum { MaxConstantDataSize = 64 }; + virtual void push_constant_data(unsigned binding, const void *data, size_t size) = 0; + + protected: + CommandBuffer() = default; + }; +} + +#endif diff --git a/glfft/glfft_wisdom.cpp b/glfft/glfft_wisdom.cpp new file mode 100644 index 0000000..ad18317 --- /dev/null +++ b/glfft/glfft_wisdom.cpp @@ -0,0 +1,600 @@ +/* Copyright (C) 2015 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "glfft_wisdom.hpp" +#include "glfft_interface.hpp" +#include "glfft.hpp" +#include + +/* GLAVA NOTICE: automatic wisdom serialization support may be added at a late date */ +#ifdef GLFFT_SERIALIZATION +#include "rapidjson/reader.h" +#include "rapidjson/prettywriter.h" +#include "rapidjson/stringbuffer.h" +#include "rapidjson/document.h" +using namespace rapidjson; +#endif + +#ifdef GLFFT_CLI_ASYNC +#include "glfft_cli.hpp" +#endif + +using namespace std; +using namespace GLFFT; + +FFTStaticWisdom FFTWisdom::get_static_wisdom_from_renderer(Context *context) +{ + FFTStaticWisdom res; + + const char *renderer = context->get_renderer_string(); + unsigned threads = context->get_max_work_group_threads(); + + if (strstr(renderer, "GeForce") || strstr(renderer, "Quadro")) + { + context->log("Detected GeForce/Quadro GPU.\n"); + res.min_workgroup_size = 32; // Warp threads. + res.min_workgroup_size_shared = 32; + res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good. + res.min_vector_size = 2; + res.max_vector_size = 2; + res.shared_banked = FFTStaticWisdom::True; + } + else if (strstr(renderer, "Radeon")) + { + context->log("Detected Radeon GPU.\n"); + res.min_workgroup_size = 64; // Wavefront threads (GCN). + res.min_workgroup_size_shared = 128; + res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good. + // TODO: Find if we can restrict this to 2 or 4 always. + res.min_vector_size = 2; + res.max_vector_size = 4; + res.shared_banked = FFTStaticWisdom::True; + } + else if (strstr(renderer, "Mali")) + { + context->log("Detected Mali GPU.\n"); + + res.min_workgroup_size = 4; + res.min_workgroup_size_shared = 4; + res.max_workgroup_size = 64; // Going beyond 64 threads per WG is not a good idea. + res.min_vector_size = 4; + res.max_vector_size = 4; + res.shared_banked = FFTStaticWisdom::False; + } + // TODO: Add more GPUs. + + return res; +} + +pair FFTWisdom::learn_optimal_options( + Context *context, unsigned Nx, unsigned Ny, unsigned radix, + Mode mode, Target input_target, Target output_target, + const FFTOptions::Type &type) +{ + WisdomPass pass = { + { + Nx, Ny, radix, mode, input_target, output_target, + type, + }, + 0.0, + }; + + auto itr = library.find(pass); + if (itr != end(library)) + { + return make_pair(itr->first.cost, itr->second); + } + else + { + auto result = study(context, pass, type); + pass.cost = result.first; + library[pass] = result.second; + + return result; + } +} + +void FFTWisdom::learn_optimal_options_exhaustive(Context *context, + unsigned Nx, unsigned Ny, + Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type) +{ + bool learn_resolve = type == ComplexToReal || type == RealToComplex; + Mode vertical_mode = type == ComplexToComplexDual ? VerticalDual : Vertical; + Mode horizontal_mode = type == ComplexToComplexDual ? HorizontalDual : Horizontal; + + // Create wisdom for horizontal transforms and vertical transform. + static const unsigned radices[] = { 4, 8, 16, 64 }; + for (auto radix : radices) + { + try + { + // If we're doing SSBO -> Image or Image -> SSBO. Create wisdom for the two variants. + + // Learn plain transforms. + if (Ny > 1) + { + learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, SSBO, fft_type); + } + learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, SSBO, fft_type); + + // Learn the first/last pass transforms. Can be fairly significant since accessing textures makes more sense with + // block interleave and larger WG_Y sizes. + if (input_target != SSBO) + { + if (Ny > 1) + { + learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, input_target, SSBO, fft_type); + } + learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, input_target, SSBO, fft_type); + } + + if (output_target != SSBO) + { + if (Ny > 1) + { + learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, output_target, fft_type); + } + learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, output_target, fft_type); + } + } +#ifdef GLFFT_CLI_ASYNC + catch (const AsyncCancellation &) + { + throw; + } +#endif + catch (...) + { + // If our default options cannot successfully create the radix pass (i.e. throws), + // just ignore it for purpose of creating wisdom. + } + } + + auto resolve_type = fft_type; + resolve_type.input_fp16 = resolve_type.output_fp16; + Mode resolve_mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex; + Target resolve_input_target = SSBO; + + // If we have C2R Nx1 transform, the first pass is resolve, so use those types. + if (type == ComplexToReal && Ny == 1) + { + resolve_type = fft_type; + resolve_input_target = input_target; + } + + // If we need to do a resolve pass, train this case as well. + if (learn_resolve) + { + try + { + // If Ny == 1 and we're doing RealToComplex, this will be the last pass, so use output_target as target. + if (Ny == 1 && resolve_mode == ResolveRealToComplex) + { + learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, output_target, resolve_type); + } + else + { + learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, SSBO, resolve_type); + } + } +#ifdef GLFFT_CLI_ASYNC + catch (const AsyncCancellation &) + { + throw; + } +#endif + catch (...) + { + // If our default options cannot successfully create the radix pass (i.e. throws), + // just ignore it for purpose of creating wisdom. + } + } +} + +double FFTWisdom::bench(Context *context, Resource *output, Resource *input, + const WisdomPass &pass, const FFTOptions &options, const shared_ptr &cache) const +{ + FFT fft(context, pass.pass.Nx, pass.pass.Ny, pass.pass.radix, pass.pass.input_target != SSBO ? 1 : pass.pass.radix, + pass.pass.mode, pass.pass.input_target, pass.pass.output_target, + cache, options); + + return fft.bench(context, + output, input, params.warmup, params.iterations, params.dispatches, params.timeout); +} + +static inline unsigned mode_to_size(Mode mode) +{ + switch (mode) + { + case VerticalDual: + case HorizontalDual: + case ResolveRealToComplex: + case ResolveComplexToReal: + return 4; + + default: + return 2; + } +} + +std::pair FFTWisdom::study(Context *context, const WisdomPass &pass, FFTOptions::Type type) const +{ + auto cache = make_shared(); + + unique_ptr output; + unique_ptr input; + + unsigned mode_size = mode_to_size(pass.pass.mode); + vector tmp(mode_size * pass.pass.Nx * pass.pass.Ny); + + if (pass.pass.input_target == SSBO) + { + input = context->create_buffer(tmp.data(), tmp.size() * sizeof(float) >> type.input_fp16, AccessStaticCopy); + } + else + { + Format format = FormatUnknown; + unsigned Nx = pass.pass.Nx; + unsigned Ny = pass.pass.Ny; + + switch (pass.pass.mode) + { + case VerticalDual: + case HorizontalDual: + format = FormatR32G32B32A32Float; + break; + + case Vertical: + case Horizontal: + format = FormatR32G32Float; + break; + + case ResolveComplexToReal: + format = FormatR32G32Float; + Nx *= 2; + break; + + default: + throw logic_error("Invalid input mode.\n"); + } + + input = context->create_texture(tmp.data(), Nx, Ny, format); + } + + if (pass.pass.output_target == SSBO) + { + output = context->create_buffer(nullptr, tmp.size() * sizeof(float) >> type.output_fp16, AccessStreamCopy); + } + else + { + Format format = FormatUnknown; + unsigned Nx = pass.pass.Nx; + unsigned Ny = pass.pass.Ny; + + switch (pass.pass.mode) + { + case VerticalDual: + case HorizontalDual: + format = FormatR32G32B32A32Float; + break; + + case Vertical: + case Horizontal: + format = FormatR32G32Float; + break; + + case ResolveRealToComplex: + format = FormatR32G32Float; + Nx *= 2; + break; + + default: + throw logic_error("Invalid output mode.\n"); + } + + output = context->create_texture(nullptr, Nx, Ny, format); + } + + // Exhaustive search, look for every sensible combination, and find fastest parameters. + // Get initial best cost with defaults. + FFTOptions::Performance best_perf; + double minimum_cost = bench(context, output.get(), input.get(), pass, { best_perf, type }, cache); + + static const FFTStaticWisdom::Tristate shared_banked_values[] = { FFTStaticWisdom::False, FFTStaticWisdom::True }; + static const unsigned vector_size_values[] = { 2, 4, 8 }; + static const unsigned workgroup_size_x_values[] = { 4, 8, 16, 32, 64, 128, 256 }; + static const unsigned workgroup_size_y_values[] = { 1, 2, 4, 8, }; + + bool test_resolve = pass.pass.mode == ResolveComplexToReal || pass.pass.mode == ResolveRealToComplex; + bool test_dual = pass.pass.mode == VerticalDual || pass.pass.mode == HorizontalDual; + unsigned bench_count = 0; + + for (auto shared_banked : shared_banked_values) + { + // Useless test, since shared banked is only relevant for radix 16/64. + if (pass.pass.radix < 16 && shared_banked) + { + continue; + } + + bool fair_shared_banked = (pass.pass.radix < 16) || + (static_wisdom.shared_banked == FFTStaticWisdom::DontCare) || + (shared_banked == static_wisdom.shared_banked); + + if (!fair_shared_banked) + { + continue; + } + + for (auto vector_size : vector_size_values) + { + // Resolve passes currently only support vector size 2. Shared banked makes no sense either. + if (test_resolve && (vector_size != 2 || shared_banked)) + { + continue; + } + + // We can only use vector_size 8 with FP16. + if (vector_size == 8 && (!type.fp16 || !type.input_fp16 || !type.output_fp16)) + { + continue; + } + + // Makes little sense to test since since vector_size will be bumped to 4 anyways. + if (test_dual && vector_size < 4) + { + continue; + } + + for (auto workgroup_size_x : workgroup_size_x_values) + { + for (auto workgroup_size_y : workgroup_size_y_values) + { + unsigned workgroup_size = workgroup_size_x * workgroup_size_y; + + unsigned min_workgroup_size = pass.pass.radix >= 16 ? static_wisdom.min_workgroup_size_shared : + static_wisdom.min_workgroup_size; + + unsigned min_vector_size = test_dual ? max(4u, static_wisdom.min_vector_size) : static_wisdom.min_vector_size; + unsigned max_vector_size = test_dual ? max(4u, static_wisdom.max_vector_size) : static_wisdom.max_vector_size; + + bool fair_workgroup_size = workgroup_size <= static_wisdom.max_workgroup_size && + workgroup_size >= min_workgroup_size; + if (pass.pass.Ny == 1 && workgroup_size_y > 1) + { + fair_workgroup_size = false; + } + + if (!fair_workgroup_size) + { + continue; + } + + // If we have dual mode, accept vector sizes larger than max. + bool fair_vector_size = test_resolve || (vector_size <= max_vector_size && + vector_size >= min_vector_size); + + if (!fair_vector_size) + { + continue; + } + + FFTOptions::Performance perf; + perf.shared_banked = shared_banked; + perf.vector_size = vector_size; + perf.workgroup_size_x = workgroup_size_x; + perf.workgroup_size_y = workgroup_size_y; + + try + { + // If workgroup sizes are too big for our test, this will throw. + double cost = bench(context, output.get(), input.get(), pass, { perf, type }, cache); + bench_count++; + +#if 1 + context->log("\nWisdom run (mode = %u, radix = %u):\n", pass.pass.mode, pass.pass.radix); + context->log(" Width: %4u\n", pass.pass.Nx); + context->log(" Height: %4u\n", pass.pass.Ny); + context->log(" Shared banked: %3s\n", shared_banked ? "yes" : "no"); + context->log(" Vector size: %u\n", vector_size); + context->log(" Workgroup size: (%u, %u)\n", workgroup_size_x, workgroup_size_y); + context->log(" Cost: %8.3g\n", cost); +#endif + + if (cost < minimum_cost) + { +#if 1 + context->log(" New optimal solution! (%g -> %g)\n", minimum_cost, cost); +#endif + best_perf = perf; + minimum_cost = cost; + } + } +#ifdef GLFFT_CLI_ASYNC + catch (const AsyncCancellation &) + { + throw; + } +#endif + catch (...) + { + // If we pass in bogus parameters, + // FFT will throw and we just ignore this. + } + } + } + } + } + + context->log("Tested %u variants!\n", bench_count); + return make_pair(minimum_cost, best_perf); +} + +const pair* FFTWisdom::find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix, + Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type) const +{ + WisdomPass pass = { + { + Nx, Ny, radix, mode, input_target, output_target, + type, + }, + 0.0, + }; + + auto itr = library.find(pass); + return itr != end(library) ? (&(*itr)) : nullptr; +} + +const FFTOptions::Performance& FFTWisdom::find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix, + Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const +{ + WisdomPass pass = { + { + Nx, Ny, radix, mode, input_target, output_target, + base_options.type, + }, + 0.0, + }; + + auto itr = library.find(pass); + +#if 0 + if (itr == end(library)) + { + context->log("Didn't find options for (%u x %u, radix %u, mode %u, input_target %u, output_target %u)\n", + Nx, Ny, radix, unsigned(mode), unsigned(input_target), unsigned(output_target)); + } +#endif + + return itr != end(library) ? itr->second : base_options.performance; +} + +#ifdef GLFFT_SERIALIZATION +std::string FFTWisdom::archive() const +{ + StringBuffer s; + PrettyWriter writer{s}; + + writer.StartObject(); + writer.String("library"); + + // Serialize all wisdom accumulated to a string. + writer.StartArray(); + for (auto &entry : library) + { + writer.StartObject(); + + writer.String("scenario"); + writer.StartObject(); + writer.String("nx"); + writer.Uint(entry.first.pass.Nx); + writer.String("ny"); + writer.Uint(entry.first.pass.Ny); + writer.String("radix"); + writer.Uint(entry.first.pass.radix); + writer.String("mode"); + writer.Uint(entry.first.pass.mode); + writer.String("input_target"); + writer.Uint(entry.first.pass.input_target); + writer.String("output_target"); + writer.Uint(entry.first.pass.output_target); + writer.EndObject(); + + writer.String("type"); + writer.StartObject(); + writer.String("fp16"); + writer.Bool(entry.first.pass.type.fp16); + writer.String("input_fp16"); + writer.Bool(entry.first.pass.type.input_fp16); + writer.String("output_fp16"); + writer.Bool(entry.first.pass.type.output_fp16); + writer.String("normalize"); + writer.Bool(entry.first.pass.type.normalize); + writer.EndObject(); + + writer.String("performance"); + writer.StartObject(); + writer.String("shared_banked"); + writer.Bool(entry.second.shared_banked); + writer.String("vector_size"); + writer.Uint(entry.second.vector_size); + writer.String("workgroup_size_x"); + writer.Uint(entry.second.workgroup_size_x); + writer.String("workgroup_size_y"); + writer.Uint(entry.second.workgroup_size_y); + writer.EndObject(); + + writer.String("cost"); + writer.Double(entry.first.cost); + + writer.EndObject(); + } + writer.EndArray(); + writer.EndObject(); + return s.GetString(); +} + +void FFTWisdom::extract(const char *json) +{ + Document document; + document.Parse(json); + + // Exception safe, we don't want to risk throwing in the middle of the + // loop, leaving the library is broken state. + unordered_map new_library; + + auto &lib = document["library"]; + + // y u no begin(), end() :( + for (Value::ConstValueIterator itr = lib.Begin(); itr != lib.End(); ++itr) + { + auto &v = *itr; + + WisdomPass pass; + FFTOptions::Performance perf; + + pass.cost = v["cost"].GetDouble(); + + auto &scenario = v["scenario"]; + pass.pass.Nx = scenario["nx"].GetUint(); + pass.pass.Ny = scenario["ny"].GetUint(); + pass.pass.radix = scenario["radix"].GetUint(); + pass.pass.mode = static_cast(scenario["mode"].GetUint()); + pass.pass.input_target = static_cast(scenario["input_target"].GetUint()); + pass.pass.output_target = static_cast(scenario["output_target"].GetUint()); + + auto &type = v["type"]; + pass.pass.type.fp16 = type["fp16"].GetBool(); + pass.pass.type.input_fp16 = type["input_fp16"].GetBool(); + pass.pass.type.output_fp16 = type["output_fp16"].GetBool(); + pass.pass.type.normalize = type["normalize"].GetBool(); + + auto &performance = v["performance"]; + perf.shared_banked = performance["shared_banked"].GetBool(); + perf.vector_size = performance["vector_size"].GetUint(); + perf.workgroup_size_x = performance["workgroup_size_x"].GetUint(); + perf.workgroup_size_y = performance["workgroup_size_y"].GetUint(); + + new_library[pass] = perf; + } + + // Exception safe. + swap(library, new_library); +} +#endif + diff --git a/glfft/glfft_wisdom.hpp b/glfft/glfft_wisdom.hpp new file mode 100644 index 0000000..d2a11ee --- /dev/null +++ b/glfft/glfft_wisdom.hpp @@ -0,0 +1,149 @@ +/* Copyright (C) 2015 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef GLFFT_WISDOM_HPP__ +#define GLFFT_WISDOM_HPP__ + +#include +#include +#include +#include "glfft_common.hpp" +#include "glfft_interface.hpp" + +namespace GLFFT +{ + +struct WisdomPass +{ + struct + { + unsigned Nx; + unsigned Ny; + unsigned radix; + Mode mode; + Target input_target; + Target output_target; + FFTOptions::Type type; + } pass; + + double cost; + + bool operator==(const WisdomPass &other) const + { + return std::memcmp(&pass, &other.pass, sizeof(pass)) == 0; + } +}; + +} + +namespace std +{ + template<> + struct hash + { + std::size_t operator()(const GLFFT::WisdomPass ¶ms) const + { + std::size_t h = 0; + hash hasher; + for (std::size_t i = 0; i < sizeof(params.pass); i++) + { + h ^= hasher(reinterpret_cast(¶ms.pass)[i]); + } + + return h; + } + }; +} + +namespace GLFFT +{ + +// Adds information which depends on the GPU vendor. +// This can speed up learning process, since there will be fewer "obviously wrong" settings to test. +struct FFTStaticWisdom +{ + enum Tristate { True = 1, False = 0, DontCare = -1 }; + + unsigned min_workgroup_size = 1; + unsigned min_workgroup_size_shared = 1; + unsigned max_workgroup_size = 128; // GLES 3.1 mandates support for this. + unsigned min_vector_size = 2; + unsigned max_vector_size = 4; + Tristate shared_banked = DontCare; +}; + +class FFTWisdom +{ + public: + std::pair learn_optimal_options(Context *ctx, + unsigned Nx, unsigned Ny, unsigned radix, + Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type); + + void learn_optimal_options_exhaustive(Context *ctx, + unsigned Nx, unsigned Ny, + Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type); + + const std::pair* find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix, + Mode mode, Target input_target, Target output_target, const FFTOptions::Type &base_options) const; + + const FFTOptions::Performance& find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix, + Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const; + + void set_static_wisdom(FFTStaticWisdom static_wisdom) { this->static_wisdom = static_wisdom; } + static FFTStaticWisdom get_static_wisdom_from_renderer(Context *context); + + void set_bench_params(unsigned warmup, + unsigned iterations, unsigned dispatches, double timeout) + { + params.warmup = warmup; + params.iterations = iterations; + params.dispatches = dispatches; + params.timeout = timeout; + } + +#ifdef GLFFT_SERIALIZATION + // Serialization interface. + std::string archive() const; + void extract(const char *json); +#endif + + private: + std::unordered_map library; + + std::pair study(Context *context, + const WisdomPass &pass, FFTOptions::Type options) const; + + double bench(Context *cmd, Resource *output, Resource *input, + const WisdomPass &pass, const FFTOptions &options, + const std::shared_ptr &cache) const; + + FFTStaticWisdom static_wisdom; + + struct + { + unsigned warmup = 2; + unsigned iterations = 20; + unsigned dispatches = 50; + double timeout = 1.0; + } params; +}; + +} + +#endif +