Add GLFFT
This commit is contained in:
19
glfft/LICENSE_ORIGINAL
Normal file
19
glfft/LICENSE_ORIGINAL
Normal file
@@ -0,0 +1,19 @@
|
||||
Copyright (c) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
1125
glfft/glfft.cpp
Normal file
1125
glfft/glfft.cpp
Normal file
File diff suppressed because it is too large
Load Diff
225
glfft/glfft.hpp
Normal file
225
glfft/glfft.hpp
Normal file
@@ -0,0 +1,225 @@
|
||||
/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
|
||||
*
|
||||
* Permission is hereby granted, free of charge,
|
||||
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation the rights to
|
||||
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef GLFFT_HPP__
|
||||
#define GLFFT_HPP__
|
||||
|
||||
#include "glfft_interface.hpp"
|
||||
#include "glfft_common.hpp"
|
||||
#include "glfft_wisdom.hpp"
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <limits>
|
||||
|
||||
/// GLFFT doesn't try to preserve GL state in any way.
|
||||
/// E.g. SHADER_STORAGE_BUFFER bindings, programs bound, texture bindings, etc.
|
||||
/// Applications calling this library must expect that some GL state will be modified.
|
||||
/// No rendering state associated with graphics will be modified.
|
||||
|
||||
namespace GLFFT
|
||||
{
|
||||
|
||||
class FFT
|
||||
{
|
||||
public:
|
||||
/// @brief Creates a full FFT.
|
||||
///
|
||||
/// All buffer allocation done by GLFFT will be done in constructor.
|
||||
/// Will throw if invalid parameters are passed.
|
||||
///
|
||||
/// @param context The graphics context.
|
||||
/// @param Nx Number of samples in horizontal dimension.
|
||||
/// @param Ny Number of samples in vertical dimension.
|
||||
/// @param type The transform type.
|
||||
/// @param direction Forward, inverse or inverse with convolution.
|
||||
/// For real-to-complex and complex-to-real transforms, the
|
||||
/// transform type must match.
|
||||
/// @param input_target GL object type of input target. For real-to-complex with texture as input, ImageReal is used.
|
||||
/// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used.
|
||||
/// @param cache A program cache for caching the GLFFT programs created.
|
||||
/// @param options FFT options such as performance related parameters and types.
|
||||
/// @param wisdom GLFFT wisdom which can override performance related options
|
||||
/// (options.performance is used as a fallback).
|
||||
FFT(Context *context, unsigned Nx, unsigned Ny,
|
||||
Type type, Direction direction, Target input_target, Target output_target,
|
||||
std::shared_ptr<ProgramCache> cache, const FFTOptions &options,
|
||||
const FFTWisdom &wisdom = FFTWisdom());
|
||||
|
||||
/// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs.
|
||||
///
|
||||
/// All buffer allocation done by GLFFT will be done in constructor.
|
||||
/// Will throw if invalid parameters are passed.
|
||||
///
|
||||
/// @param context The graphics context.
|
||||
/// @param Nx Number of samples in horizontal dimension.
|
||||
/// @param Ny Number of samples in vertical dimension.
|
||||
/// @param radix FFT radix to test.
|
||||
/// @param p Accumulated p factor. If 1, "first pass" mode is tested, otherwise, generic FFT stages.
|
||||
/// @param mode The transform mode.
|
||||
/// @param input_target GL object type of input target. For real-to-complex with texture as input, ImageReal is used.
|
||||
/// @param output_target GL object type of output target. For complex-to-real with texture as output, ImageReal is used.
|
||||
/// @param cache A program cache for caching the GLFFT programs created.
|
||||
/// @param options FFT options such as performance related parameters and types.
|
||||
FFT(Context *context, unsigned Nx, unsigned Ny, unsigned radix, unsigned p,
|
||||
Mode mode, Target input_target, Target output_target,
|
||||
std::shared_ptr<ProgramCache> cache, const FFTOptions &options);
|
||||
|
||||
/// @brief Process the FFT.
|
||||
///
|
||||
/// The type of object passed here must match what FFT was initialized with.
|
||||
///
|
||||
/// @param cmd Command buffer for issuing dispatch commands.
|
||||
/// @param output Output buffer or image.
|
||||
/// NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D!
|
||||
/// @param input Input buffer or texture.
|
||||
/// @param input_aux If using convolution transform type,
|
||||
/// the content of input and input_aux will be multiplied together.
|
||||
void process(CommandBuffer *cmd, Resource *output, Resource *input, Resource *input_aux = nullptr);
|
||||
|
||||
/// @brief Run process() multiple times, timing the results.
|
||||
///
|
||||
/// Mostly used internally by GLFFT wisdom, glfft_cli's bench, and so on.
|
||||
///
|
||||
/// @param context The graphics context.
|
||||
/// @param output Output buffer or image.
|
||||
/// NOTE: For images, the texture must be using immutable storage, i.e. glTexStorage2D!
|
||||
/// @param input Input buffer or texture.
|
||||
/// @param warmup_iterations Number of iterations to run to "warm" up GL, ensures we don't hit
|
||||
/// recompilations or similar when benching.
|
||||
/// @param iterations Number of iterations to run the benchmark.
|
||||
/// Each iteration will ensure timing with a glFinish() followed by timing.
|
||||
/// @param dispatches_per_iteration Number of calls to process() we should do per iteration.
|
||||
/// @param max_time The max time the benchmark should run. Will be checked after each iteration is complete.
|
||||
///
|
||||
/// @returns Average GPU time per process() call.
|
||||
double bench(Context *context, Resource *output, Resource *input,
|
||||
unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration,
|
||||
double max_time = std::numeric_limits<double>::max());
|
||||
|
||||
/// @brief Returns cost for a process() call. Only used for debugging.
|
||||
double get_cost() const { return cost; }
|
||||
|
||||
/// @brief Returns number of passes (glDispatchCompute) in a process() call.
|
||||
unsigned get_num_passes() const { return passes.size(); }
|
||||
|
||||
/// @brief Returns Nx.
|
||||
unsigned get_dimension_x() const { return size_x; }
|
||||
/// @brief Returns Ny.
|
||||
unsigned get_dimension_y() const { return size_y; }
|
||||
|
||||
/// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures.
|
||||
///
|
||||
/// By default, these values are 0.5 / size (samples in the center of texel (0, 0)).
|
||||
/// Scale is 1.0 / size, so it steps one texel for each coordinate in the FFT transform.
|
||||
/// Setting this to something custom is useful to get downsampling with GL_LINEAR -> FFT transform
|
||||
/// without having to downsample the texture first, then FFT.
|
||||
void set_texture_offset_scale(float offset_x, float offset_y, float scale_x, float scale_y)
|
||||
{
|
||||
texture.offset_x = offset_x;
|
||||
texture.offset_y = offset_y;
|
||||
texture.scale_x = scale_x;
|
||||
texture.scale_y = scale_y;
|
||||
}
|
||||
|
||||
/// @brief Set binding range for input.
|
||||
///
|
||||
/// If input is an SSBO, set a custom binding range to be passed to glBindBufferRange.
|
||||
/// By default, the entire buffer is bound.
|
||||
void set_input_buffer_range(size_t offset, size_t size)
|
||||
{
|
||||
ssbo.input.offset = offset;
|
||||
ssbo.input.size = size;
|
||||
}
|
||||
|
||||
/// @brief Set binding range for input_aux.
|
||||
///
|
||||
/// If input_aux is an SSBO, set a custom binding range to be passed to glBindBufferRange.
|
||||
/// By default, the entire buffer is bound.
|
||||
void set_input_aux_buffer_range(size_t offset, size_t size)
|
||||
{
|
||||
ssbo.input_aux.offset = offset;
|
||||
ssbo.input_aux.size = size;
|
||||
}
|
||||
|
||||
/// @brief Set binding range for output.
|
||||
///
|
||||
/// If output buffer is an SSBO, set a custom binding range to be passed to glBindBufferRange.
|
||||
/// By default, the entire buffer is bound.
|
||||
void set_output_buffer_range(size_t offset, size_t size)
|
||||
{
|
||||
ssbo.output.offset = offset;
|
||||
ssbo.output.size = size;
|
||||
}
|
||||
|
||||
/// @brief Set samplers for input textures.
|
||||
///
|
||||
/// Set sampler objects to be used for input and input_aux if textures are used as input.
|
||||
/// By default, sampler object 0 will be used (inheriting sampler parameters from the texture object itself).
|
||||
void set_samplers(Sampler *sampler0, Sampler *sampler1 = nullptr)
|
||||
{
|
||||
texture.samplers[0] = sampler0;
|
||||
texture.samplers[1] = sampler1;
|
||||
}
|
||||
|
||||
private:
|
||||
Context *context;
|
||||
|
||||
struct Pass
|
||||
{
|
||||
Parameters parameters;
|
||||
|
||||
unsigned workgroups_x;
|
||||
unsigned workgroups_y;
|
||||
unsigned uv_scale_x;
|
||||
unsigned stride;
|
||||
Program *program;
|
||||
};
|
||||
|
||||
double cost = 0.0;
|
||||
|
||||
std::unique_ptr<Buffer> temp_buffer;
|
||||
std::unique_ptr<Buffer> temp_buffer_image;
|
||||
std::vector<Pass> passes;
|
||||
std::shared_ptr<ProgramCache> cache;
|
||||
|
||||
std::unique_ptr<Program> build_program(const Parameters ¶ms);
|
||||
static std::string load_shader_string(const char *path);
|
||||
static void store_shader_string(const char *path, const std::string &source);
|
||||
|
||||
Program* get_program(const Parameters ¶ms);
|
||||
|
||||
struct
|
||||
{
|
||||
float offset_x = 0.0f, offset_y = 0.0f, scale_x = 1.0f, scale_y = 1.0f;
|
||||
Sampler *samplers[2] = { nullptr, nullptr };
|
||||
} texture;
|
||||
|
||||
struct
|
||||
{
|
||||
struct
|
||||
{
|
||||
size_t offset = 0;
|
||||
size_t size = 0;
|
||||
} input, input_aux, output;
|
||||
} ssbo;
|
||||
unsigned size_x, size_y;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
178
glfft/glfft_common.hpp
Normal file
178
glfft/glfft_common.hpp
Normal file
@@ -0,0 +1,178 @@
|
||||
/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
|
||||
*
|
||||
* Permission is hereby granted, free of charge,
|
||||
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation the rights to
|
||||
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// For the most part used by the implementation.
|
||||
|
||||
#ifndef GLFFT_COMMON_HPP__
|
||||
#define GLFFT_COMMON_HPP__
|
||||
|
||||
#include "glfft_interface.hpp"
|
||||
#include <functional>
|
||||
#include <cstddef>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace GLFFT
|
||||
{
|
||||
|
||||
enum Direction
|
||||
{
|
||||
/// Forward FFT transform.
|
||||
Forward = -1,
|
||||
/// Inverse FFT transform, but with two inputs (in frequency domain) which are multiplied together
|
||||
/// for convolution.
|
||||
InverseConvolve = 0,
|
||||
/// Inverse FFT transform.
|
||||
Inverse = 1
|
||||
};
|
||||
|
||||
enum Mode
|
||||
{
|
||||
Horizontal,
|
||||
HorizontalDual,
|
||||
Vertical,
|
||||
VerticalDual,
|
||||
|
||||
ResolveRealToComplex,
|
||||
ResolveComplexToReal,
|
||||
};
|
||||
|
||||
enum Type
|
||||
{
|
||||
/// Regular complex-to-complex transform.
|
||||
ComplexToComplex,
|
||||
/// Complex-to-complex dual transform where the complex value is four-dimensional,
|
||||
/// i.e. a vector of two complex values. Typically used to transform RGBA data.
|
||||
ComplexToComplexDual,
|
||||
/// Complex-to-real transform. N / 2 + 1 complex values are used per row with a stride of N complex samples.
|
||||
ComplexToReal,
|
||||
/// Real-to-complex transform. N / 2 + 1 complex output samples are created per row with a stride of N complex samples.
|
||||
RealToComplex
|
||||
};
|
||||
|
||||
enum Target
|
||||
{
|
||||
/// GL_SHADER_STORAGE_BUFFER
|
||||
SSBO,
|
||||
/// Textures, when used as output, type is determined by transform type.
|
||||
/// ComplexToComplex / RealToComplex -> GL_RG16F
|
||||
/// ComplexToComplexDual -> GL_RGBA16F
|
||||
Image,
|
||||
/// Real-valued (single component) textures, when used as output, type is determined by transform type.
|
||||
/// ComplexToReal -> GL_R32F (because GLES 3.1 doesn't have GL_R16F image type).
|
||||
ImageReal
|
||||
};
|
||||
|
||||
struct Parameters
|
||||
{
|
||||
unsigned workgroup_size_x;
|
||||
unsigned workgroup_size_y;
|
||||
unsigned workgroup_size_z;
|
||||
unsigned radix;
|
||||
unsigned vector_size;
|
||||
Direction direction;
|
||||
Mode mode;
|
||||
Target input_target;
|
||||
Target output_target;
|
||||
bool p1;
|
||||
bool shared_banked;
|
||||
bool fft_fp16, input_fp16, output_fp16;
|
||||
bool fft_normalize;
|
||||
|
||||
bool operator==(const Parameters &other) const
|
||||
{
|
||||
return std::memcmp(this, &other, sizeof(Parameters)) == 0;
|
||||
}
|
||||
};
|
||||
|
||||
/// @brief Options for FFT implementation.
|
||||
/// Defaults for performance as conservative.
|
||||
struct FFTOptions
|
||||
{
|
||||
struct Performance
|
||||
{
|
||||
/// Workgroup size used in layout(local_size_x).
|
||||
/// Only affects performance, however, large values may make implementations of smaller sized FFTs impossible.
|
||||
/// FFT constructor will throw in this case.
|
||||
unsigned workgroup_size_x = 4;
|
||||
/// Workgroup size used in layout(local_size_x).
|
||||
/// Only affects performance, however, large values may make implementations of smaller sized FFTs impossible.
|
||||
/// FFT constructor will throw in this case.
|
||||
unsigned workgroup_size_y = 1;
|
||||
/// Vector size. Very GPU dependent. "Scalar" GPUs prefer 2 here, vector GPUs prefer 4 (and maybe 8).
|
||||
unsigned vector_size = 2;
|
||||
/// Whether to use banked shared memory or not.
|
||||
/// Desktop GPUs prefer true here, false for mobile in general.
|
||||
bool shared_banked = false;
|
||||
} performance;
|
||||
|
||||
struct Type
|
||||
{
|
||||
/// Whether internal shader should be mediump float.
|
||||
bool fp16 = false;
|
||||
/// Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
|
||||
bool input_fp16 = false;
|
||||
/// Whether output SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
|
||||
bool output_fp16 = false;
|
||||
/// Whether to apply 1 / N normalization factor.
|
||||
bool normalize = false;
|
||||
} type;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<>
|
||||
struct hash<GLFFT::Parameters>
|
||||
{
|
||||
std::size_t operator()(const GLFFT::Parameters ¶ms) const
|
||||
{
|
||||
std::size_t h = 0;
|
||||
hash<uint8_t> hasher;
|
||||
for (std::size_t i = 0; i < sizeof(GLFFT::Parameters); i++)
|
||||
{
|
||||
h ^= hasher(reinterpret_cast<const uint8_t*>(¶ms)[i]);
|
||||
}
|
||||
|
||||
return h;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace GLFFT
|
||||
{
|
||||
|
||||
class ProgramCache
|
||||
{
|
||||
public:
|
||||
Program* find_program(const Parameters ¶meters) const;
|
||||
void insert_program(const Parameters ¶meters, std::unique_ptr<Program> program);
|
||||
size_t cache_size() const { return programs.size(); }
|
||||
|
||||
private:
|
||||
std::unordered_map<Parameters, std::unique_ptr<Program>> programs;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
6
glfft/glfft_gl_api_headers.hpp
Normal file
6
glfft/glfft_gl_api_headers.hpp
Normal file
@@ -0,0 +1,6 @@
|
||||
|
||||
/* Let GLFFT use GLava's headers */
|
||||
#define GLFFT_GLSL_LANG_STRING "#version 430 core\n"
|
||||
extern "C" {
|
||||
#include "../glava/glad.h"
|
||||
}
|
||||
310
glfft/glfft_gl_interface.cpp
Normal file
310
glfft/glfft_gl_interface.cpp
Normal file
@@ -0,0 +1,310 @@
|
||||
/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
|
||||
*
|
||||
* Permission is hereby granted, free of charge,
|
||||
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation the rights to
|
||||
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "glfft_gl_interface.hpp"
|
||||
#ifdef GLFFT_GL_DEBUG
|
||||
#include "glfft_validate.hpp"
|
||||
#endif
|
||||
#include <cstdarg>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
using namespace GLFFT;
|
||||
using namespace std;
|
||||
|
||||
GLCommandBuffer GLContext::static_command_buffer;
|
||||
|
||||
void GLCommandBuffer::bind_program(Program *program)
|
||||
{
|
||||
glUseProgram(program ? static_cast<GLProgram*>(program)->name : 0);
|
||||
}
|
||||
|
||||
void GLCommandBuffer::bind_storage_texture(unsigned binding, Texture *texture, Format format)
|
||||
{
|
||||
glBindImageTexture(binding, static_cast<GLTexture*>(texture)->name,
|
||||
0, GL_FALSE, 0, GL_WRITE_ONLY, convert(format));
|
||||
}
|
||||
|
||||
void GLCommandBuffer::bind_texture(unsigned binding, Texture *texture)
|
||||
{
|
||||
glActiveTexture(GL_TEXTURE0 + binding);
|
||||
glBindTexture(GL_TEXTURE_2D, static_cast<GLTexture*>(texture)->name);
|
||||
}
|
||||
|
||||
void GLCommandBuffer::bind_sampler(unsigned binding, Sampler *sampler)
|
||||
{
|
||||
glBindSampler(binding, sampler ? static_cast<GLSampler*>(sampler)->name : 0);
|
||||
}
|
||||
|
||||
void GLCommandBuffer::bind_storage_buffer(unsigned binding, Buffer *buffer)
|
||||
{
|
||||
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, binding, static_cast<GLBuffer*>(buffer)->name);
|
||||
}
|
||||
|
||||
void GLCommandBuffer::bind_storage_buffer_range(unsigned binding, size_t offset, size_t size, Buffer *buffer)
|
||||
{
|
||||
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, static_cast<GLBuffer*>(buffer)->name, offset, size);
|
||||
}
|
||||
|
||||
void GLCommandBuffer::dispatch(unsigned x, unsigned y, unsigned z)
|
||||
{
|
||||
glDispatchCompute(x, y, z);
|
||||
}
|
||||
|
||||
void GLCommandBuffer::barrier(Buffer*)
|
||||
{
|
||||
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
|
||||
}
|
||||
|
||||
void GLCommandBuffer::barrier(Texture*)
|
||||
{
|
||||
glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
|
||||
}
|
||||
|
||||
void GLCommandBuffer::barrier()
|
||||
{
|
||||
glMemoryBarrier(GL_ALL_BARRIER_BITS);
|
||||
}
|
||||
|
||||
void GLCommandBuffer::push_constant_data(unsigned binding, const void *data, size_t size)
|
||||
{
|
||||
glBindBufferBase(GL_UNIFORM_BUFFER, binding, ubos[ubo_index]);
|
||||
void *ptr = glMapBufferRange(GL_UNIFORM_BUFFER,
|
||||
0, CommandBuffer::MaxConstantDataSize,
|
||||
GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
|
||||
|
||||
if (ptr)
|
||||
{
|
||||
std::memcpy(ptr, data, size);
|
||||
glUnmapBuffer(GL_UNIFORM_BUFFER);
|
||||
}
|
||||
|
||||
if (++ubo_index >= ubo_count)
|
||||
ubo_index = 0;
|
||||
}
|
||||
|
||||
CommandBuffer* GLContext::request_command_buffer()
|
||||
{
|
||||
if (!initialized_ubos)
|
||||
{
|
||||
glGenBuffers(MaxBuffersRing, ubos);
|
||||
for (auto &ubo : ubos)
|
||||
{
|
||||
glBindBuffer(GL_UNIFORM_BUFFER, ubo);
|
||||
glBufferData(GL_UNIFORM_BUFFER, CommandBuffer::MaxConstantDataSize, nullptr, GL_STREAM_DRAW);
|
||||
}
|
||||
static_command_buffer.set_constant_data_buffers(ubos, MaxBuffersRing);
|
||||
initialized_ubos = true;
|
||||
}
|
||||
return &static_command_buffer;
|
||||
}
|
||||
|
||||
void GLContext::submit_command_buffer(CommandBuffer*)
|
||||
{}
|
||||
|
||||
void GLContext::wait_idle()
|
||||
{
|
||||
glFinish();
|
||||
}
|
||||
|
||||
unique_ptr<Texture> GLContext::create_texture(const void *initial_data,
|
||||
unsigned width, unsigned height,
|
||||
Format format)
|
||||
{
|
||||
return unique_ptr<Texture>(new GLTexture(initial_data, width, height, format));
|
||||
}
|
||||
|
||||
unique_ptr<Buffer> GLContext::create_buffer(const void *initial_data, size_t size, AccessMode access)
|
||||
{
|
||||
return unique_ptr<Buffer>(new GLBuffer(initial_data, size, access));
|
||||
}
|
||||
|
||||
unique_ptr<Program> GLContext::compile_compute_shader(const char *source)
|
||||
{
|
||||
#ifdef GLFFT_GL_DEBUG
|
||||
if (!validate_glsl_source(source))
|
||||
return nullptr;
|
||||
#endif
|
||||
|
||||
GLuint program = glCreateProgram();
|
||||
if (!program)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
|
||||
|
||||
const char *sources[] = { GLFFT_GLSL_LANG_STRING, source };
|
||||
glShaderSource(shader, 2, sources, NULL);
|
||||
glCompileShader(shader);
|
||||
|
||||
GLint status;
|
||||
glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
|
||||
if (status == GL_FALSE)
|
||||
{
|
||||
GLint len;
|
||||
GLsizei out_len;
|
||||
|
||||
glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &len);
|
||||
vector<char> buf(len);
|
||||
glGetShaderInfoLog(shader, len, &out_len, buf.data());
|
||||
log("GLFFT: Shader log:\n%s\n\n", buf.data());
|
||||
|
||||
glDeleteShader(shader);
|
||||
glDeleteProgram(program);
|
||||
return 0;
|
||||
}
|
||||
|
||||
glAttachShader(program, shader);
|
||||
glLinkProgram(program);
|
||||
glDeleteShader(shader);
|
||||
|
||||
glGetProgramiv(program, GL_LINK_STATUS, &status);
|
||||
if (status == GL_FALSE)
|
||||
{
|
||||
GLint len;
|
||||
GLsizei out_len;
|
||||
glGetProgramiv(program, GL_INFO_LOG_LENGTH, &len);
|
||||
vector<char> buf(len);
|
||||
glGetProgramInfoLog(program, len, &out_len, buf.data());
|
||||
log("Program log:\n%s\n\n", buf.data());
|
||||
|
||||
glDeleteProgram(program);
|
||||
glDeleteShader(shader);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return unique_ptr<Program>(new GLProgram(program));
|
||||
}
|
||||
|
||||
void GLContext::log(const char *fmt, ...)
|
||||
{
|
||||
char buffer[4 * 1024];
|
||||
|
||||
va_list va;
|
||||
va_start(va, fmt);
|
||||
vsnprintf(buffer, sizeof(buffer), fmt, va);
|
||||
va_end(va);
|
||||
glfft_log("%s", buffer);
|
||||
}
|
||||
|
||||
double GLContext::get_time()
|
||||
{
|
||||
return glfft_time();
|
||||
}
|
||||
|
||||
unsigned GLContext::get_max_work_group_threads()
|
||||
{
|
||||
GLint value;
|
||||
glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &value);
|
||||
return value;
|
||||
}
|
||||
|
||||
const char* GLContext::get_renderer_string()
|
||||
{
|
||||
return reinterpret_cast<const char*>(glGetString(GL_RENDERER));
|
||||
}
|
||||
|
||||
const void* GLContext::map(Buffer *buffer, size_t offset, size_t size)
|
||||
{
|
||||
glBindBuffer(GL_SHADER_STORAGE_BUFFER, static_cast<GLBuffer*>(buffer)->name);
|
||||
const void *ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, offset, size, GL_MAP_READ_BIT);
|
||||
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void GLContext::unmap(Buffer *buffer)
|
||||
{
|
||||
glBindBuffer(GL_SHADER_STORAGE_BUFFER, static_cast<GLBuffer*>(buffer)->name);
|
||||
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
|
||||
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
|
||||
}
|
||||
|
||||
void GLContext::teardown()
|
||||
{
|
||||
if (initialized_ubos)
|
||||
glDeleteBuffers(MaxBuffersRing, ubos);
|
||||
initialized_ubos = false;
|
||||
}
|
||||
|
||||
GLContext::~GLContext()
|
||||
{
|
||||
teardown();
|
||||
}
|
||||
|
||||
GLTexture::GLTexture(const void *initial_data,
|
||||
unsigned width, unsigned height,
|
||||
Format format)
|
||||
{
|
||||
glGenTextures(1, &name);
|
||||
glBindTexture(GL_TEXTURE_2D, name);
|
||||
glTexStorage2D(GL_TEXTURE_2D, 1, convert(format), width, height);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
|
||||
|
||||
if (initial_data)
|
||||
{
|
||||
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height,
|
||||
convert_format(format), convert_type(format), initial_data);
|
||||
}
|
||||
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
}
|
||||
|
||||
GLTexture::~GLTexture()
|
||||
{
|
||||
if (owned)
|
||||
glDeleteTextures(1, &name);
|
||||
}
|
||||
|
||||
GLBuffer::GLBuffer(const void *initial_data, size_t size, AccessMode access)
|
||||
{
|
||||
glGenBuffers(1, &name);
|
||||
glBindBuffer(GL_SHADER_STORAGE_BUFFER, name);
|
||||
glBufferData(GL_SHADER_STORAGE_BUFFER, size, initial_data, convert(access));
|
||||
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
|
||||
}
|
||||
|
||||
GLBuffer::~GLBuffer()
|
||||
{
|
||||
if (owned)
|
||||
glDeleteBuffers(1, &name);
|
||||
}
|
||||
|
||||
GLProgram::GLProgram(GLuint name)
|
||||
: name(name)
|
||||
{}
|
||||
|
||||
GLProgram::~GLProgram()
|
||||
{
|
||||
if (name != 0)
|
||||
{
|
||||
glDeleteProgram(name);
|
||||
}
|
||||
}
|
||||
|
||||
GLSampler::~GLSampler()
|
||||
{
|
||||
if (name != 0)
|
||||
{
|
||||
glDeleteSamplers(1, &name);
|
||||
}
|
||||
}
|
||||
|
||||
258
glfft/glfft_gl_interface.hpp
Normal file
258
glfft/glfft_gl_interface.hpp
Normal file
@@ -0,0 +1,258 @@
|
||||
/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
|
||||
*
|
||||
* Permission is hereby granted, free of charge,
|
||||
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation the rights to
|
||||
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef GLFFT_GL_INTERFACE_HPP__
|
||||
#define GLFFT_GL_INTERFACE_HPP__
|
||||
|
||||
#include "glfft_interface.hpp"
|
||||
|
||||
#include "glfft_gl_api_headers.hpp"
|
||||
|
||||
/* GLava additions (POSIX) */
|
||||
extern "C" {
|
||||
#include <time.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <error.h>
|
||||
}
|
||||
|
||||
#ifndef GLFFT_GLSL_LANG_STRING
|
||||
#error GLFFT_GLSL_LANG_STRING must be defined to e.g. "#version 310 es\n" or "#version 430 core\n".
|
||||
#endif
|
||||
|
||||
#ifndef GLFFT_LOG_OVERRIDE
|
||||
void glfft_log(const char *fmt, ...) {
|
||||
va_list l;
|
||||
va_start(l, fmt);
|
||||
vfprintf(stdout, fmt, l);
|
||||
va_end(l);
|
||||
}
|
||||
#else
|
||||
#define glfft_log GLFFT_LOG_OVERRIDE
|
||||
#endif
|
||||
|
||||
#ifndef GLFFT_TIME_OVERRIDE
|
||||
double glfft_time() {
|
||||
struct timespec tv;
|
||||
if (clock_gettime(CLOCK_REALTIME, &tv)) {
|
||||
fprintf(stderr, "clock_gettime(CLOCK_REALTIME, ...): %s\n", strerror(errno));
|
||||
}
|
||||
return (double) tv.tv_sec + ((double) tv.tv_nsec / 1000000000.0);
|
||||
}
|
||||
#else
|
||||
#define glfft_time GLFFT_TIME_OVERRIDE
|
||||
#endif
|
||||
|
||||
namespace GLFFT
|
||||
{
|
||||
class GLContext;
|
||||
|
||||
class GLTexture : public Texture
|
||||
{
|
||||
public:
|
||||
friend class GLContext;
|
||||
friend class GLCommandBuffer;
|
||||
~GLTexture();
|
||||
|
||||
GLTexture(GLuint obj) : name(obj), owned(false) {}
|
||||
GLuint get() const { return name; }
|
||||
|
||||
private:
|
||||
GLTexture(const void *initial_data,
|
||||
unsigned width, unsigned height,
|
||||
Format format);
|
||||
GLuint name;
|
||||
bool owned = true;
|
||||
};
|
||||
|
||||
// Not really used by test and bench code, but can be useful for API users.
|
||||
class GLSampler : public Sampler
|
||||
{
|
||||
public:
|
||||
friend class GLContext;
|
||||
friend class GLCommandBuffer;
|
||||
~GLSampler();
|
||||
|
||||
GLSampler(GLuint obj) : name(obj) {}
|
||||
GLuint get() const { return name; }
|
||||
|
||||
private:
|
||||
GLuint name;
|
||||
};
|
||||
|
||||
class GLBuffer : public Buffer
|
||||
{
|
||||
public:
|
||||
friend class GLContext;
|
||||
friend class GLCommandBuffer;
|
||||
~GLBuffer();
|
||||
|
||||
GLBuffer(GLuint obj) : name(obj), owned(false) {}
|
||||
GLuint get() const { return name; }
|
||||
|
||||
private:
|
||||
GLuint name;
|
||||
GLBuffer(const void *initial_data, size_t size, AccessMode access);
|
||||
bool owned = true;
|
||||
};
|
||||
|
||||
class GLProgram : public Program
|
||||
{
|
||||
public:
|
||||
friend class GLContext;
|
||||
friend class GLCommandBuffer;
|
||||
~GLProgram();
|
||||
|
||||
GLuint get() const { return name; }
|
||||
|
||||
private:
|
||||
GLProgram(GLuint name);
|
||||
GLuint name;
|
||||
};
|
||||
|
||||
class GLCommandBuffer : public CommandBuffer
|
||||
{
|
||||
public:
|
||||
~GLCommandBuffer() = default;
|
||||
|
||||
void set_constant_data_buffers(const GLuint *ubos, unsigned count)
|
||||
{
|
||||
this->ubos = ubos;
|
||||
ubo_index = 0;
|
||||
ubo_count = count;
|
||||
}
|
||||
|
||||
void bind_program(Program *program) override;
|
||||
void bind_storage_texture(unsigned binding, Texture *texture, Format format) override;
|
||||
void bind_texture(unsigned binding, Texture *texture) override;
|
||||
void bind_sampler(unsigned binding, Sampler *sampler) override;
|
||||
void bind_storage_buffer(unsigned binding, Buffer *texture) override;
|
||||
void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) override;
|
||||
void dispatch(unsigned x, unsigned y, unsigned z) override;
|
||||
|
||||
void barrier(Buffer *buffer) override;
|
||||
void barrier(Texture *buffer) override;
|
||||
void barrier() override;
|
||||
|
||||
void push_constant_data(unsigned binding, const void *data, size_t size) override;
|
||||
|
||||
private:
|
||||
const GLuint *ubos = nullptr;
|
||||
unsigned ubo_count = 0;
|
||||
unsigned ubo_index = 0;
|
||||
};
|
||||
|
||||
class GLContext : public Context
|
||||
{
|
||||
public:
|
||||
~GLContext();
|
||||
|
||||
std::unique_ptr<Texture> create_texture(const void *initial_data,
|
||||
unsigned width, unsigned height,
|
||||
Format format) override;
|
||||
|
||||
std::unique_ptr<Buffer> create_buffer(const void *initial_data, size_t size, AccessMode access) override;
|
||||
std::unique_ptr<Program> compile_compute_shader(const char *source) override;
|
||||
|
||||
CommandBuffer* request_command_buffer() override;
|
||||
void submit_command_buffer(CommandBuffer *cmd) override;
|
||||
void wait_idle() override;
|
||||
|
||||
const char* get_renderer_string() override;
|
||||
void log(const char *fmt, ...) override;
|
||||
double get_time() override;
|
||||
|
||||
unsigned get_max_work_group_threads() override;
|
||||
|
||||
const void* map(Buffer *buffer, size_t offset, size_t size) override;
|
||||
void unmap(Buffer *buffer) override;
|
||||
|
||||
// Not supported in GLES, so override when creating platform-specific context.
|
||||
bool supports_texture_readback() override { return false; }
|
||||
void read_texture(void*, Texture*, Format) override {}
|
||||
|
||||
protected:
|
||||
void teardown();
|
||||
|
||||
private:
|
||||
static GLCommandBuffer static_command_buffer;
|
||||
|
||||
enum { MaxBuffersRing = 256 };
|
||||
GLuint ubos[MaxBuffersRing];
|
||||
bool initialized_ubos = false;
|
||||
};
|
||||
|
||||
static inline GLenum convert(AccessMode mode)
|
||||
{
|
||||
switch (mode)
|
||||
{
|
||||
case AccessStreamCopy: return GL_STREAM_COPY;
|
||||
case AccessStaticCopy: return GL_STATIC_COPY;
|
||||
case AccessStreamRead: return GL_STREAM_READ;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline GLenum convert(Format format)
|
||||
{
|
||||
switch (format)
|
||||
{
|
||||
case FormatR16G16B16A16Float: return GL_RGBA16F;
|
||||
case FormatR32G32B32A32Float: return GL_RGBA32F;
|
||||
case FormatR32Float: return GL_R32F;
|
||||
case FormatR16G16Float: return GL_RG16F;
|
||||
case FormatR32G32Float: return GL_RG32F;
|
||||
case FormatR32Uint: return GL_R32UI;
|
||||
case FormatUnknown: return 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline GLenum convert_format(Format format)
|
||||
{
|
||||
switch (format)
|
||||
{
|
||||
case FormatR16G16Float: return GL_RG;
|
||||
case FormatR32G32Float: return GL_RG;
|
||||
case FormatR16G16B16A16Float: return GL_RGBA;
|
||||
case FormatR32G32B32A32Float: return GL_RGBA;
|
||||
case FormatR32Float: return GL_RED;
|
||||
case FormatR32Uint: return GL_RED_INTEGER;
|
||||
case FormatUnknown: return 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline GLenum convert_type(Format format)
|
||||
{
|
||||
switch (format)
|
||||
{
|
||||
case FormatR16G16Float: return GL_HALF_FLOAT;
|
||||
case FormatR16G16B16A16Float: return GL_HALF_FLOAT;
|
||||
case FormatR32Float: return GL_FLOAT;
|
||||
case FormatR32G32Float: return GL_FLOAT;
|
||||
case FormatR32G32B32A32Float: return GL_FLOAT;
|
||||
case FormatR32Uint: return GL_UNSIGNED_INT;
|
||||
case FormatUnknown: return 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
131
glfft/glfft_interface.hpp
Normal file
131
glfft/glfft_interface.hpp
Normal file
@@ -0,0 +1,131 @@
|
||||
/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
|
||||
*
|
||||
* Permission is hereby granted, free of charge,
|
||||
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation the rights to
|
||||
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef GLFFT_INTERFACE_HPP__
|
||||
#define GLFFT_INTERFACE_HPP__
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace GLFFT
|
||||
{
|
||||
class Context;
|
||||
|
||||
class Resource
|
||||
{
|
||||
public:
|
||||
virtual ~Resource() = default;
|
||||
|
||||
// Non-movable, non-copyable to make things simpler.
|
||||
Resource(Resource&&) = delete;
|
||||
void operator=(const Resource&) = delete;
|
||||
|
||||
protected:
|
||||
Resource() = default;
|
||||
};
|
||||
|
||||
class Texture : public Resource {};
|
||||
class Sampler : public Resource {};
|
||||
class Buffer : public Resource {};
|
||||
|
||||
class Program
|
||||
{
|
||||
public:
|
||||
virtual ~Program() = default;
|
||||
protected:
|
||||
friend class Context;
|
||||
Program() = default;
|
||||
};
|
||||
|
||||
enum AccessMode
|
||||
{
|
||||
AccessStreamCopy,
|
||||
AccessStaticCopy,
|
||||
AccessStreamRead
|
||||
};
|
||||
|
||||
enum Format
|
||||
{
|
||||
FormatUnknown,
|
||||
FormatR16G16B16A16Float,
|
||||
FormatR32G32B32A32Float,
|
||||
FormatR32G32Float,
|
||||
FormatR32Float,
|
||||
FormatR16G16Float,
|
||||
FormatR32Uint
|
||||
};
|
||||
|
||||
class CommandBuffer;
|
||||
|
||||
class Context
|
||||
{
|
||||
public:
|
||||
virtual ~Context() = default;
|
||||
|
||||
virtual std::unique_ptr<Texture> create_texture(const void *initial_data,
|
||||
unsigned width, unsigned height,
|
||||
Format format) = 0;
|
||||
|
||||
virtual std::unique_ptr<Buffer> create_buffer(const void *initial_data, size_t size, AccessMode access) = 0;
|
||||
virtual std::unique_ptr<Program> compile_compute_shader(const char *source) = 0;
|
||||
|
||||
virtual CommandBuffer* request_command_buffer() = 0;
|
||||
virtual void submit_command_buffer(CommandBuffer *cmd) = 0;
|
||||
virtual void wait_idle() = 0;
|
||||
|
||||
virtual const char* get_renderer_string() = 0;
|
||||
virtual void log(const char *fmt, ...) = 0;
|
||||
virtual double get_time() = 0;
|
||||
|
||||
virtual unsigned get_max_work_group_threads() = 0;
|
||||
|
||||
virtual const void* map(Buffer *buffer, size_t offset, size_t size) = 0;
|
||||
virtual void unmap(Buffer *buffer) = 0;
|
||||
|
||||
virtual bool supports_texture_readback() = 0;
|
||||
virtual void read_texture(void *buffer, Texture *texture, Format format) = 0;
|
||||
|
||||
protected:
|
||||
Context() = default;
|
||||
};
|
||||
|
||||
class CommandBuffer
|
||||
{
|
||||
public:
|
||||
virtual ~CommandBuffer() = default;
|
||||
|
||||
virtual void bind_program(Program *program) = 0;
|
||||
virtual void bind_storage_texture(unsigned binding, Texture *texture, Format format) = 0;
|
||||
virtual void bind_texture(unsigned binding, Texture *texture) = 0;
|
||||
virtual void bind_sampler(unsigned binding, Sampler *sampler) = 0;
|
||||
virtual void bind_storage_buffer(unsigned binding, Buffer *texture) = 0;
|
||||
virtual void bind_storage_buffer_range(unsigned binding, size_t offset, size_t length, Buffer *texture) = 0;
|
||||
virtual void dispatch(unsigned x, unsigned y, unsigned z) = 0;
|
||||
|
||||
virtual void barrier(Buffer *buffer) = 0;
|
||||
virtual void barrier(Texture *buffer) = 0;
|
||||
virtual void barrier() = 0;
|
||||
|
||||
enum { MaxConstantDataSize = 64 };
|
||||
virtual void push_constant_data(unsigned binding, const void *data, size_t size) = 0;
|
||||
|
||||
protected:
|
||||
CommandBuffer() = default;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
600
glfft/glfft_wisdom.cpp
Normal file
600
glfft/glfft_wisdom.cpp
Normal file
@@ -0,0 +1,600 @@
|
||||
/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
|
||||
*
|
||||
* Permission is hereby granted, free of charge,
|
||||
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation the rights to
|
||||
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "glfft_wisdom.hpp"
|
||||
#include "glfft_interface.hpp"
|
||||
#include "glfft.hpp"
|
||||
#include <utility>
|
||||
|
||||
/* GLAVA NOTICE: automatic wisdom serialization support may be added at a late date */
|
||||
#ifdef GLFFT_SERIALIZATION
|
||||
#include "rapidjson/reader.h"
|
||||
#include "rapidjson/prettywriter.h"
|
||||
#include "rapidjson/stringbuffer.h"
|
||||
#include "rapidjson/document.h"
|
||||
using namespace rapidjson;
|
||||
#endif
|
||||
|
||||
#ifdef GLFFT_CLI_ASYNC
|
||||
#include "glfft_cli.hpp"
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace GLFFT;
|
||||
|
||||
FFTStaticWisdom FFTWisdom::get_static_wisdom_from_renderer(Context *context)
|
||||
{
|
||||
FFTStaticWisdom res;
|
||||
|
||||
const char *renderer = context->get_renderer_string();
|
||||
unsigned threads = context->get_max_work_group_threads();
|
||||
|
||||
if (strstr(renderer, "GeForce") || strstr(renderer, "Quadro"))
|
||||
{
|
||||
context->log("Detected GeForce/Quadro GPU.\n");
|
||||
res.min_workgroup_size = 32; // Warp threads.
|
||||
res.min_workgroup_size_shared = 32;
|
||||
res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
|
||||
res.min_vector_size = 2;
|
||||
res.max_vector_size = 2;
|
||||
res.shared_banked = FFTStaticWisdom::True;
|
||||
}
|
||||
else if (strstr(renderer, "Radeon"))
|
||||
{
|
||||
context->log("Detected Radeon GPU.\n");
|
||||
res.min_workgroup_size = 64; // Wavefront threads (GCN).
|
||||
res.min_workgroup_size_shared = 128;
|
||||
res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
|
||||
// TODO: Find if we can restrict this to 2 or 4 always.
|
||||
res.min_vector_size = 2;
|
||||
res.max_vector_size = 4;
|
||||
res.shared_banked = FFTStaticWisdom::True;
|
||||
}
|
||||
else if (strstr(renderer, "Mali"))
|
||||
{
|
||||
context->log("Detected Mali GPU.\n");
|
||||
|
||||
res.min_workgroup_size = 4;
|
||||
res.min_workgroup_size_shared = 4;
|
||||
res.max_workgroup_size = 64; // Going beyond 64 threads per WG is not a good idea.
|
||||
res.min_vector_size = 4;
|
||||
res.max_vector_size = 4;
|
||||
res.shared_banked = FFTStaticWisdom::False;
|
||||
}
|
||||
// TODO: Add more GPUs.
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
pair<double, FFTOptions::Performance> FFTWisdom::learn_optimal_options(
|
||||
Context *context, unsigned Nx, unsigned Ny, unsigned radix,
|
||||
Mode mode, Target input_target, Target output_target,
|
||||
const FFTOptions::Type &type)
|
||||
{
|
||||
WisdomPass pass = {
|
||||
{
|
||||
Nx, Ny, radix, mode, input_target, output_target,
|
||||
type,
|
||||
},
|
||||
0.0,
|
||||
};
|
||||
|
||||
auto itr = library.find(pass);
|
||||
if (itr != end(library))
|
||||
{
|
||||
return make_pair(itr->first.cost, itr->second);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto result = study(context, pass, type);
|
||||
pass.cost = result.first;
|
||||
library[pass] = result.second;
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
void FFTWisdom::learn_optimal_options_exhaustive(Context *context,
|
||||
unsigned Nx, unsigned Ny,
|
||||
Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type)
|
||||
{
|
||||
bool learn_resolve = type == ComplexToReal || type == RealToComplex;
|
||||
Mode vertical_mode = type == ComplexToComplexDual ? VerticalDual : Vertical;
|
||||
Mode horizontal_mode = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
|
||||
|
||||
// Create wisdom for horizontal transforms and vertical transform.
|
||||
static const unsigned radices[] = { 4, 8, 16, 64 };
|
||||
for (auto radix : radices)
|
||||
{
|
||||
try
|
||||
{
|
||||
// If we're doing SSBO -> Image or Image -> SSBO. Create wisdom for the two variants.
|
||||
|
||||
// Learn plain transforms.
|
||||
if (Ny > 1)
|
||||
{
|
||||
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, SSBO, fft_type);
|
||||
}
|
||||
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, SSBO, fft_type);
|
||||
|
||||
// Learn the first/last pass transforms. Can be fairly significant since accessing textures makes more sense with
|
||||
// block interleave and larger WG_Y sizes.
|
||||
if (input_target != SSBO)
|
||||
{
|
||||
if (Ny > 1)
|
||||
{
|
||||
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, input_target, SSBO, fft_type);
|
||||
}
|
||||
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, input_target, SSBO, fft_type);
|
||||
}
|
||||
|
||||
if (output_target != SSBO)
|
||||
{
|
||||
if (Ny > 1)
|
||||
{
|
||||
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, output_target, fft_type);
|
||||
}
|
||||
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, output_target, fft_type);
|
||||
}
|
||||
}
|
||||
#ifdef GLFFT_CLI_ASYNC
|
||||
catch (const AsyncCancellation &)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
#endif
|
||||
catch (...)
|
||||
{
|
||||
// If our default options cannot successfully create the radix pass (i.e. throws),
|
||||
// just ignore it for purpose of creating wisdom.
|
||||
}
|
||||
}
|
||||
|
||||
auto resolve_type = fft_type;
|
||||
resolve_type.input_fp16 = resolve_type.output_fp16;
|
||||
Mode resolve_mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex;
|
||||
Target resolve_input_target = SSBO;
|
||||
|
||||
// If we have C2R Nx1 transform, the first pass is resolve, so use those types.
|
||||
if (type == ComplexToReal && Ny == 1)
|
||||
{
|
||||
resolve_type = fft_type;
|
||||
resolve_input_target = input_target;
|
||||
}
|
||||
|
||||
// If we need to do a resolve pass, train this case as well.
|
||||
if (learn_resolve)
|
||||
{
|
||||
try
|
||||
{
|
||||
// If Ny == 1 and we're doing RealToComplex, this will be the last pass, so use output_target as target.
|
||||
if (Ny == 1 && resolve_mode == ResolveRealToComplex)
|
||||
{
|
||||
learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, output_target, resolve_type);
|
||||
}
|
||||
else
|
||||
{
|
||||
learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, SSBO, resolve_type);
|
||||
}
|
||||
}
|
||||
#ifdef GLFFT_CLI_ASYNC
|
||||
catch (const AsyncCancellation &)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
#endif
|
||||
catch (...)
|
||||
{
|
||||
// If our default options cannot successfully create the radix pass (i.e. throws),
|
||||
// just ignore it for purpose of creating wisdom.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double FFTWisdom::bench(Context *context, Resource *output, Resource *input,
|
||||
const WisdomPass &pass, const FFTOptions &options, const shared_ptr<ProgramCache> &cache) const
|
||||
{
|
||||
FFT fft(context, pass.pass.Nx, pass.pass.Ny, pass.pass.radix, pass.pass.input_target != SSBO ? 1 : pass.pass.radix,
|
||||
pass.pass.mode, pass.pass.input_target, pass.pass.output_target,
|
||||
cache, options);
|
||||
|
||||
return fft.bench(context,
|
||||
output, input, params.warmup, params.iterations, params.dispatches, params.timeout);
|
||||
}
|
||||
|
||||
static inline unsigned mode_to_size(Mode mode)
|
||||
{
|
||||
switch (mode)
|
||||
{
|
||||
case VerticalDual:
|
||||
case HorizontalDual:
|
||||
case ResolveRealToComplex:
|
||||
case ResolveComplexToReal:
|
||||
return 4;
|
||||
|
||||
default:
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, const WisdomPass &pass, FFTOptions::Type type) const
|
||||
{
|
||||
auto cache = make_shared<ProgramCache>();
|
||||
|
||||
unique_ptr<Resource> output;
|
||||
unique_ptr<Resource> input;
|
||||
|
||||
unsigned mode_size = mode_to_size(pass.pass.mode);
|
||||
vector<float> tmp(mode_size * pass.pass.Nx * pass.pass.Ny);
|
||||
|
||||
if (pass.pass.input_target == SSBO)
|
||||
{
|
||||
input = context->create_buffer(tmp.data(), tmp.size() * sizeof(float) >> type.input_fp16, AccessStaticCopy);
|
||||
}
|
||||
else
|
||||
{
|
||||
Format format = FormatUnknown;
|
||||
unsigned Nx = pass.pass.Nx;
|
||||
unsigned Ny = pass.pass.Ny;
|
||||
|
||||
switch (pass.pass.mode)
|
||||
{
|
||||
case VerticalDual:
|
||||
case HorizontalDual:
|
||||
format = FormatR32G32B32A32Float;
|
||||
break;
|
||||
|
||||
case Vertical:
|
||||
case Horizontal:
|
||||
format = FormatR32G32Float;
|
||||
break;
|
||||
|
||||
case ResolveComplexToReal:
|
||||
format = FormatR32G32Float;
|
||||
Nx *= 2;
|
||||
break;
|
||||
|
||||
default:
|
||||
throw logic_error("Invalid input mode.\n");
|
||||
}
|
||||
|
||||
input = context->create_texture(tmp.data(), Nx, Ny, format);
|
||||
}
|
||||
|
||||
if (pass.pass.output_target == SSBO)
|
||||
{
|
||||
output = context->create_buffer(nullptr, tmp.size() * sizeof(float) >> type.output_fp16, AccessStreamCopy);
|
||||
}
|
||||
else
|
||||
{
|
||||
Format format = FormatUnknown;
|
||||
unsigned Nx = pass.pass.Nx;
|
||||
unsigned Ny = pass.pass.Ny;
|
||||
|
||||
switch (pass.pass.mode)
|
||||
{
|
||||
case VerticalDual:
|
||||
case HorizontalDual:
|
||||
format = FormatR32G32B32A32Float;
|
||||
break;
|
||||
|
||||
case Vertical:
|
||||
case Horizontal:
|
||||
format = FormatR32G32Float;
|
||||
break;
|
||||
|
||||
case ResolveRealToComplex:
|
||||
format = FormatR32G32Float;
|
||||
Nx *= 2;
|
||||
break;
|
||||
|
||||
default:
|
||||
throw logic_error("Invalid output mode.\n");
|
||||
}
|
||||
|
||||
output = context->create_texture(nullptr, Nx, Ny, format);
|
||||
}
|
||||
|
||||
// Exhaustive search, look for every sensible combination, and find fastest parameters.
|
||||
// Get initial best cost with defaults.
|
||||
FFTOptions::Performance best_perf;
|
||||
double minimum_cost = bench(context, output.get(), input.get(), pass, { best_perf, type }, cache);
|
||||
|
||||
static const FFTStaticWisdom::Tristate shared_banked_values[] = { FFTStaticWisdom::False, FFTStaticWisdom::True };
|
||||
static const unsigned vector_size_values[] = { 2, 4, 8 };
|
||||
static const unsigned workgroup_size_x_values[] = { 4, 8, 16, 32, 64, 128, 256 };
|
||||
static const unsigned workgroup_size_y_values[] = { 1, 2, 4, 8, };
|
||||
|
||||
bool test_resolve = pass.pass.mode == ResolveComplexToReal || pass.pass.mode == ResolveRealToComplex;
|
||||
bool test_dual = pass.pass.mode == VerticalDual || pass.pass.mode == HorizontalDual;
|
||||
unsigned bench_count = 0;
|
||||
|
||||
for (auto shared_banked : shared_banked_values)
|
||||
{
|
||||
// Useless test, since shared banked is only relevant for radix 16/64.
|
||||
if (pass.pass.radix < 16 && shared_banked)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
bool fair_shared_banked = (pass.pass.radix < 16) ||
|
||||
(static_wisdom.shared_banked == FFTStaticWisdom::DontCare) ||
|
||||
(shared_banked == static_wisdom.shared_banked);
|
||||
|
||||
if (!fair_shared_banked)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
for (auto vector_size : vector_size_values)
|
||||
{
|
||||
// Resolve passes currently only support vector size 2. Shared banked makes no sense either.
|
||||
if (test_resolve && (vector_size != 2 || shared_banked))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// We can only use vector_size 8 with FP16.
|
||||
if (vector_size == 8 && (!type.fp16 || !type.input_fp16 || !type.output_fp16))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Makes little sense to test since since vector_size will be bumped to 4 anyways.
|
||||
if (test_dual && vector_size < 4)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
for (auto workgroup_size_x : workgroup_size_x_values)
|
||||
{
|
||||
for (auto workgroup_size_y : workgroup_size_y_values)
|
||||
{
|
||||
unsigned workgroup_size = workgroup_size_x * workgroup_size_y;
|
||||
|
||||
unsigned min_workgroup_size = pass.pass.radix >= 16 ? static_wisdom.min_workgroup_size_shared :
|
||||
static_wisdom.min_workgroup_size;
|
||||
|
||||
unsigned min_vector_size = test_dual ? max(4u, static_wisdom.min_vector_size) : static_wisdom.min_vector_size;
|
||||
unsigned max_vector_size = test_dual ? max(4u, static_wisdom.max_vector_size) : static_wisdom.max_vector_size;
|
||||
|
||||
bool fair_workgroup_size = workgroup_size <= static_wisdom.max_workgroup_size &&
|
||||
workgroup_size >= min_workgroup_size;
|
||||
if (pass.pass.Ny == 1 && workgroup_size_y > 1)
|
||||
{
|
||||
fair_workgroup_size = false;
|
||||
}
|
||||
|
||||
if (!fair_workgroup_size)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we have dual mode, accept vector sizes larger than max.
|
||||
bool fair_vector_size = test_resolve || (vector_size <= max_vector_size &&
|
||||
vector_size >= min_vector_size);
|
||||
|
||||
if (!fair_vector_size)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
FFTOptions::Performance perf;
|
||||
perf.shared_banked = shared_banked;
|
||||
perf.vector_size = vector_size;
|
||||
perf.workgroup_size_x = workgroup_size_x;
|
||||
perf.workgroup_size_y = workgroup_size_y;
|
||||
|
||||
try
|
||||
{
|
||||
// If workgroup sizes are too big for our test, this will throw.
|
||||
double cost = bench(context, output.get(), input.get(), pass, { perf, type }, cache);
|
||||
bench_count++;
|
||||
|
||||
#if 1
|
||||
context->log("\nWisdom run (mode = %u, radix = %u):\n", pass.pass.mode, pass.pass.radix);
|
||||
context->log(" Width: %4u\n", pass.pass.Nx);
|
||||
context->log(" Height: %4u\n", pass.pass.Ny);
|
||||
context->log(" Shared banked: %3s\n", shared_banked ? "yes" : "no");
|
||||
context->log(" Vector size: %u\n", vector_size);
|
||||
context->log(" Workgroup size: (%u, %u)\n", workgroup_size_x, workgroup_size_y);
|
||||
context->log(" Cost: %8.3g\n", cost);
|
||||
#endif
|
||||
|
||||
if (cost < minimum_cost)
|
||||
{
|
||||
#if 1
|
||||
context->log(" New optimal solution! (%g -> %g)\n", minimum_cost, cost);
|
||||
#endif
|
||||
best_perf = perf;
|
||||
minimum_cost = cost;
|
||||
}
|
||||
}
|
||||
#ifdef GLFFT_CLI_ASYNC
|
||||
catch (const AsyncCancellation &)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
#endif
|
||||
catch (...)
|
||||
{
|
||||
// If we pass in bogus parameters,
|
||||
// FFT will throw and we just ignore this.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
context->log("Tested %u variants!\n", bench_count);
|
||||
return make_pair(minimum_cost, best_perf);
|
||||
}
|
||||
|
||||
const pair<const WisdomPass, FFTOptions::Performance>* FFTWisdom::find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
|
||||
Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type) const
|
||||
{
|
||||
WisdomPass pass = {
|
||||
{
|
||||
Nx, Ny, radix, mode, input_target, output_target,
|
||||
type,
|
||||
},
|
||||
0.0,
|
||||
};
|
||||
|
||||
auto itr = library.find(pass);
|
||||
return itr != end(library) ? (&(*itr)) : nullptr;
|
||||
}
|
||||
|
||||
const FFTOptions::Performance& FFTWisdom::find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
|
||||
Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const
|
||||
{
|
||||
WisdomPass pass = {
|
||||
{
|
||||
Nx, Ny, radix, mode, input_target, output_target,
|
||||
base_options.type,
|
||||
},
|
||||
0.0,
|
||||
};
|
||||
|
||||
auto itr = library.find(pass);
|
||||
|
||||
#if 0
|
||||
if (itr == end(library))
|
||||
{
|
||||
context->log("Didn't find options for (%u x %u, radix %u, mode %u, input_target %u, output_target %u)\n",
|
||||
Nx, Ny, radix, unsigned(mode), unsigned(input_target), unsigned(output_target));
|
||||
}
|
||||
#endif
|
||||
|
||||
return itr != end(library) ? itr->second : base_options.performance;
|
||||
}
|
||||
|
||||
#ifdef GLFFT_SERIALIZATION
|
||||
std::string FFTWisdom::archive() const
|
||||
{
|
||||
StringBuffer s;
|
||||
PrettyWriter<StringBuffer> writer{s};
|
||||
|
||||
writer.StartObject();
|
||||
writer.String("library");
|
||||
|
||||
// Serialize all wisdom accumulated to a string.
|
||||
writer.StartArray();
|
||||
for (auto &entry : library)
|
||||
{
|
||||
writer.StartObject();
|
||||
|
||||
writer.String("scenario");
|
||||
writer.StartObject();
|
||||
writer.String("nx");
|
||||
writer.Uint(entry.first.pass.Nx);
|
||||
writer.String("ny");
|
||||
writer.Uint(entry.first.pass.Ny);
|
||||
writer.String("radix");
|
||||
writer.Uint(entry.first.pass.radix);
|
||||
writer.String("mode");
|
||||
writer.Uint(entry.first.pass.mode);
|
||||
writer.String("input_target");
|
||||
writer.Uint(entry.first.pass.input_target);
|
||||
writer.String("output_target");
|
||||
writer.Uint(entry.first.pass.output_target);
|
||||
writer.EndObject();
|
||||
|
||||
writer.String("type");
|
||||
writer.StartObject();
|
||||
writer.String("fp16");
|
||||
writer.Bool(entry.first.pass.type.fp16);
|
||||
writer.String("input_fp16");
|
||||
writer.Bool(entry.first.pass.type.input_fp16);
|
||||
writer.String("output_fp16");
|
||||
writer.Bool(entry.first.pass.type.output_fp16);
|
||||
writer.String("normalize");
|
||||
writer.Bool(entry.first.pass.type.normalize);
|
||||
writer.EndObject();
|
||||
|
||||
writer.String("performance");
|
||||
writer.StartObject();
|
||||
writer.String("shared_banked");
|
||||
writer.Bool(entry.second.shared_banked);
|
||||
writer.String("vector_size");
|
||||
writer.Uint(entry.second.vector_size);
|
||||
writer.String("workgroup_size_x");
|
||||
writer.Uint(entry.second.workgroup_size_x);
|
||||
writer.String("workgroup_size_y");
|
||||
writer.Uint(entry.second.workgroup_size_y);
|
||||
writer.EndObject();
|
||||
|
||||
writer.String("cost");
|
||||
writer.Double(entry.first.cost);
|
||||
|
||||
writer.EndObject();
|
||||
}
|
||||
writer.EndArray();
|
||||
writer.EndObject();
|
||||
return s.GetString();
|
||||
}
|
||||
|
||||
void FFTWisdom::extract(const char *json)
|
||||
{
|
||||
Document document;
|
||||
document.Parse(json);
|
||||
|
||||
// Exception safe, we don't want to risk throwing in the middle of the
|
||||
// loop, leaving the library is broken state.
|
||||
unordered_map<WisdomPass, FFTOptions::Performance> new_library;
|
||||
|
||||
auto &lib = document["library"];
|
||||
|
||||
// y u no begin(), end() :(
|
||||
for (Value::ConstValueIterator itr = lib.Begin(); itr != lib.End(); ++itr)
|
||||
{
|
||||
auto &v = *itr;
|
||||
|
||||
WisdomPass pass;
|
||||
FFTOptions::Performance perf;
|
||||
|
||||
pass.cost = v["cost"].GetDouble();
|
||||
|
||||
auto &scenario = v["scenario"];
|
||||
pass.pass.Nx = scenario["nx"].GetUint();
|
||||
pass.pass.Ny = scenario["ny"].GetUint();
|
||||
pass.pass.radix = scenario["radix"].GetUint();
|
||||
pass.pass.mode = static_cast<Mode>(scenario["mode"].GetUint());
|
||||
pass.pass.input_target = static_cast<Target>(scenario["input_target"].GetUint());
|
||||
pass.pass.output_target = static_cast<Target>(scenario["output_target"].GetUint());
|
||||
|
||||
auto &type = v["type"];
|
||||
pass.pass.type.fp16 = type["fp16"].GetBool();
|
||||
pass.pass.type.input_fp16 = type["input_fp16"].GetBool();
|
||||
pass.pass.type.output_fp16 = type["output_fp16"].GetBool();
|
||||
pass.pass.type.normalize = type["normalize"].GetBool();
|
||||
|
||||
auto &performance = v["performance"];
|
||||
perf.shared_banked = performance["shared_banked"].GetBool();
|
||||
perf.vector_size = performance["vector_size"].GetUint();
|
||||
perf.workgroup_size_x = performance["workgroup_size_x"].GetUint();
|
||||
perf.workgroup_size_y = performance["workgroup_size_y"].GetUint();
|
||||
|
||||
new_library[pass] = perf;
|
||||
}
|
||||
|
||||
// Exception safe.
|
||||
swap(library, new_library);
|
||||
}
|
||||
#endif
|
||||
|
||||
149
glfft/glfft_wisdom.hpp
Normal file
149
glfft/glfft_wisdom.hpp
Normal file
@@ -0,0 +1,149 @@
|
||||
/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
|
||||
*
|
||||
* Permission is hereby granted, free of charge,
|
||||
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation the rights to
|
||||
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef GLFFT_WISDOM_HPP__
|
||||
#define GLFFT_WISDOM_HPP__
|
||||
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include "glfft_common.hpp"
|
||||
#include "glfft_interface.hpp"
|
||||
|
||||
namespace GLFFT
|
||||
{
|
||||
|
||||
struct WisdomPass
|
||||
{
|
||||
struct
|
||||
{
|
||||
unsigned Nx;
|
||||
unsigned Ny;
|
||||
unsigned radix;
|
||||
Mode mode;
|
||||
Target input_target;
|
||||
Target output_target;
|
||||
FFTOptions::Type type;
|
||||
} pass;
|
||||
|
||||
double cost;
|
||||
|
||||
bool operator==(const WisdomPass &other) const
|
||||
{
|
||||
return std::memcmp(&pass, &other.pass, sizeof(pass)) == 0;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<>
|
||||
struct hash<GLFFT::WisdomPass>
|
||||
{
|
||||
std::size_t operator()(const GLFFT::WisdomPass ¶ms) const
|
||||
{
|
||||
std::size_t h = 0;
|
||||
hash<uint8_t> hasher;
|
||||
for (std::size_t i = 0; i < sizeof(params.pass); i++)
|
||||
{
|
||||
h ^= hasher(reinterpret_cast<const uint8_t*>(¶ms.pass)[i]);
|
||||
}
|
||||
|
||||
return h;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace GLFFT
|
||||
{
|
||||
|
||||
// Adds information which depends on the GPU vendor.
|
||||
// This can speed up learning process, since there will be fewer "obviously wrong" settings to test.
|
||||
struct FFTStaticWisdom
|
||||
{
|
||||
enum Tristate { True = 1, False = 0, DontCare = -1 };
|
||||
|
||||
unsigned min_workgroup_size = 1;
|
||||
unsigned min_workgroup_size_shared = 1;
|
||||
unsigned max_workgroup_size = 128; // GLES 3.1 mandates support for this.
|
||||
unsigned min_vector_size = 2;
|
||||
unsigned max_vector_size = 4;
|
||||
Tristate shared_banked = DontCare;
|
||||
};
|
||||
|
||||
class FFTWisdom
|
||||
{
|
||||
public:
|
||||
std::pair<double, FFTOptions::Performance> learn_optimal_options(Context *ctx,
|
||||
unsigned Nx, unsigned Ny, unsigned radix,
|
||||
Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type);
|
||||
|
||||
void learn_optimal_options_exhaustive(Context *ctx,
|
||||
unsigned Nx, unsigned Ny,
|
||||
Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type);
|
||||
|
||||
const std::pair<const WisdomPass, FFTOptions::Performance>* find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
|
||||
Mode mode, Target input_target, Target output_target, const FFTOptions::Type &base_options) const;
|
||||
|
||||
const FFTOptions::Performance& find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
|
||||
Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const;
|
||||
|
||||
void set_static_wisdom(FFTStaticWisdom static_wisdom) { this->static_wisdom = static_wisdom; }
|
||||
static FFTStaticWisdom get_static_wisdom_from_renderer(Context *context);
|
||||
|
||||
void set_bench_params(unsigned warmup,
|
||||
unsigned iterations, unsigned dispatches, double timeout)
|
||||
{
|
||||
params.warmup = warmup;
|
||||
params.iterations = iterations;
|
||||
params.dispatches = dispatches;
|
||||
params.timeout = timeout;
|
||||
}
|
||||
|
||||
#ifdef GLFFT_SERIALIZATION
|
||||
// Serialization interface.
|
||||
std::string archive() const;
|
||||
void extract(const char *json);
|
||||
#endif
|
||||
|
||||
private:
|
||||
std::unordered_map<WisdomPass, FFTOptions::Performance> library;
|
||||
|
||||
std::pair<double, FFTOptions::Performance> study(Context *context,
|
||||
const WisdomPass &pass, FFTOptions::Type options) const;
|
||||
|
||||
double bench(Context *cmd, Resource *output, Resource *input,
|
||||
const WisdomPass &pass, const FFTOptions &options,
|
||||
const std::shared_ptr<ProgramCache> &cache) const;
|
||||
|
||||
FFTStaticWisdom static_wisdom;
|
||||
|
||||
struct
|
||||
{
|
||||
unsigned warmup = 2;
|
||||
unsigned iterations = 20;
|
||||
unsigned dispatches = 50;
|
||||
double timeout = 1.0;
|
||||
} params;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user