Commit 9642cf83 authored by Florian Oetke's avatar Florian Oetke
Browse files

optimized tone mapping

parent f252d268
......@@ -8,46 +8,81 @@ layout (constant_id = 3) const float HISTOGRAM_MAX = 10;
layout (constant_id = 4) const float DISPLAY_MIN = 0;
layout (constant_id = 5) const float DISPLAY_MAX = 1;
layout (local_size_x = 1, local_size_y = 1, local_size_z = 1 ) in;
layout (local_size_x = 32, local_size_y = 1, local_size_z = 1 ) in;
layout(binding = 1) buffer Data
{
uint histogram[HISTOGRAM_SLOTS + 1];
};
// Work-group-shared copy of the histogram, plus the running totals used by
// the trimming loop in main(). Zero-initialized by every invocation before
// the first barrier (benign redundant stores of the same value).
shared uint local_histogram[HISTOGRAM_SLOTS];
shared int  total;      // sum of all bucket counts still in the histogram
shared int  trimmings;  // counts removed from over-full buckets this pass

/// Sums every bucket of the global histogram.
/// NOTE(review): sequential fallback — main() computes the same total
/// cooperatively via atomicAdd; kept for reference/debugging.
uint calc_total() {
	uint sum = 0;
	// Use HISTOGRAM_SLOTS (specialization constant), not a hard-coded 256,
	// so the shader stays correct if the host changes the slot count.
	for(uint i = 0; i < HISTOGRAM_SLOTS; i++) {
		sum += histogram[i];
	}
	return sum;
}
/// Trims the luminance histogram so no single bucket dominates the
/// tone-mapping curve: buckets above a ceiling derived from the remaining
/// total are clamped, iteratively, until the removed amount falls below a
/// tolerance (or a safe-guard iteration cap is hit).
/// Runs as one work group; all invocations cooperate on shared memory.
void main() {
	// calculate constants and initialize shared memory
	uint local_id   = gl_LocalInvocationID.x + gl_LocalInvocationID.y*gl_WorkGroupSize.x;
	uint local_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y;

	float world_range   = HISTOGRAM_MAX - HISTOGRAM_MIN;
	float display_range = DISPLAY_MAX - DISPLAY_MIN;
	float bin_width     = world_range / HISTOGRAM_SLOTS;

	// Every invocation stores the same zero; synchronized before first use.
	total     = 0;
	trimmings = 0;
	memoryBarrierShared();
	barrier();

	// copy histogram to shared memory and sum up total
	int sub_total = 0;
	for(uint i = local_id; i < HISTOGRAM_SLOTS; i += local_size) {
		local_histogram[i] = histogram[i];
		sub_total += int(local_histogram[i]);
	}
	atomicAdd(total, sub_total);
	memoryBarrierShared();
	barrier();

	// trim histogram until the trimmed amount per pass is below tolerance
	float tolerance  = 0.025 * total;
	int   safe_guard = 0;  // hard cap so a pathological histogram can't hang the GPU
	while(total >= tolerance && safe_guard++ < 100) {
		float ceiling = floor(total*bin_width / display_range);
		int sub_trimmings = 0;

		// trim each bucket (each invocation strides over its share of slots)
		for(uint i = local_id; i < HISTOGRAM_SLOTS; i += local_size) {
			if(local_histogram[i] > ceiling) {
				// int(ceiling): keep the accumulation in integer arithmetic —
				// GLSL has no implicit float->int conversion for +=.
				sub_trimmings += int(local_histogram[i]) - int(ceiling);
				local_histogram[i] = uint(ceiling);
			}
		}

		// update total and sum sub-trimmings to check if we are done
		memoryBarrierShared();
		barrier();
		atomicAdd(total, -sub_trimmings);
		atomicAdd(trimmings, sub_trimmings);
		memoryBarrierShared();
		barrier();

		if(trimmings < tolerance)
			break;

		// Reset for the next pass; the barrier at the top of the next
		// iteration orders these stores before the next atomicAdds.
		trimmings = 0;
	}

	// write result back to global memory
	for(uint i = local_id; i < HISTOGRAM_SLOTS; i += local_size) {
		histogram[i] = local_histogram[i];
	}
}
......@@ -8,7 +8,7 @@ layout (constant_id = 3) const float HISTOGRAM_MAX = 10;
layout (constant_id = 4) const float DISPLAY_MIN = 0;
layout (constant_id = 5) const float DISPLAY_MAX = 1;
layout (local_size_x = 1, local_size_y = 1, local_size_z = 1 ) in;
layout (local_size_x = 128, local_size_y = 1, local_size_z = 1 ) in;
layout (binding = 2, r16f) uniform writeonly image2D adjustment_factor;
......@@ -17,14 +17,61 @@ layout(binding = 1) buffer Data
uint histogram[HISTOGRAM_SLOTS + 1];
};
shared uint prefix_sum[HISTOGRAM_SLOTS];
shared uint prefix_sum[HISTOGRAM_SLOTS+1];
// based on https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html
void build_prefix_sum() {
prefix_sum[0] = 0;
prefix_sum[1] = 0;
for(uint i=2; i<HISTOGRAM_SLOTS; i++) {
int local_id = int(gl_LocalInvocationID.x);
int offset = 1;
prefix_sum[2*local_id] = histogram[2*local_id];
prefix_sum[2*local_id+1] = histogram[2*local_id+1];
// up-sweep phase
for (int d = HISTOGRAM_SLOTS/2; d > 0; d /=2) {
memoryBarrierShared();
barrier();
if (local_id < d) {
uint ai = offset*(2*local_id+1)-1;
uint bi = offset*(2*local_id+2)-1;
prefix_sum[bi] += prefix_sum[ai];
}
offset *= 2;
}
// clear the last element
if (local_id == 0) { prefix_sum[HISTOGRAM_SLOTS - 1] = 0; }
// down-sweep phase
for (int d = 1; d < HISTOGRAM_SLOTS; d *= 2) {
offset /= 2;
memoryBarrierShared();
barrier();
if (local_id < d) {
uint ai = offset*(2*local_id+1)-1;
uint bi = offset*(2*local_id+2)-1;
uint t = prefix_sum[ai];
prefix_sum[ai] = prefix_sum[bi];
prefix_sum[bi] += t;
}
}
// store the total in the last element
if (local_id == 0) {
prefix_sum[HISTOGRAM_SLOTS] = prefix_sum[HISTOGRAM_SLOTS-1] + histogram[HISTOGRAM_SLOTS-1];
}
memoryBarrierShared();
barrier();
/* old/sequencial
prefix_sum[0] = 0;
prefix_sum[HISTOGRAM_SLOTS*0+1] = 0; // ignore first bucket
for(uint i=2; i<HISTOGRAM_SLOTS+1; i++) {
prefix_sum[i] = prefix_sum[i-1] + histogram[i-1];
}
*/
}
float index_to_log_lum(uint index) {
......@@ -32,18 +79,19 @@ float index_to_log_lum(uint index) {
}
/// Converts the cumulative histogram into per-bin tone-mapping adjustment
/// factors (histogram-equalization curve remapped into the display range)
/// and writes them to the 1D adjustment_factor image.
void main() {
	// calculate constants
	uint local_id   = gl_LocalInvocationID.x;
	uint local_size = gl_WorkGroupSize.x;

	build_prefix_sum();

	// Total count: exclusive prefix at the last slot plus that slot's count.
	// NOTE(review): sum is 0 for an all-empty histogram, which would divide
	// by zero below — presumably impossible for a real frame; verify.
	float sum = prefix_sum[HISTOGRAM_SLOTS-1] + histogram[HISTOGRAM_SLOTS-1];

	for(uint i = local_id; i < HISTOGRAM_SLOTS; i += local_size) {
		// Map the cumulative fraction into [DISPLAY_MIN, DISPLAY_MAX] in log
		// space and divide out the bin's own luminance to get a gain factor.
		float factor = exp(DISPLAY_MIN + (DISPLAY_MAX-DISPLAY_MIN) * float(prefix_sum[i])/sum)
		             / exp(index_to_log_lum(i));
		imageStore(adjustment_factor, ivec2(i,0), vec4(factor));
		// DEBUG-PRINT: histogram[i] = prefix_sum[i];
	}
}
......@@ -34,13 +34,13 @@ void main() {
uint local_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y;
uvec2 stride = gl_WorkGroupSize.xy * gl_NumWorkGroups.xy;
// zero initialize block local histogram
// zero initialize local histogram
for(uint i=local_id; i<HISTOGRAM_SLOTS; i+=local_size){
local_histogram[i] = 0;
}
memoryBarrierShared();
barrier();
groupMemoryBarrier();
// build local histogram
for(uint x=gl_GlobalInvocationID.x; x<data_size.x; x+=stride.x) {
......@@ -50,15 +50,16 @@ void main() {
}
}
memoryBarrierShared();
barrier();
groupMemoryBarrier();
// copy local histogram to global memory
for(uint i=local_id; i<HISTOGRAM_SLOTS; i+=local_size){
atomicAdd(histogram[i], local_histogram[i]);
}
/*
/* OLD:
float lum = imageLoad(input_image, ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y)).r;
lum = clamp(lum, HISTOGRAM_MIN, HISTOGRAM_MAX);
......
......@@ -472,10 +472,6 @@ namespace mirrage {
nk_property_float(ctx, "Exposure", 0.f, &renderer_settings.exposure_override, 50.f, 0.01f, 0.1f);
bool_nk_wrapper = renderer_settings.histogram_adjustment ? 1 : 0;
nk_checkbox_label(ctx, "Histogram Adjustment", &bool_nk_wrapper);
renderer_settings.histogram_adjustment = bool_nk_wrapper == 1;
nk_property_float(
ctx, "Background Brightness", 0.f, &renderer_settings.background_intensity, 10.f, 1, 0.1f);
......@@ -671,6 +667,19 @@ namespace mirrage {
std::to_string(static_cast<int>(_window_width * _window_height - histogram_sum))
.c_str(),
NK_TEXT_CENTERED);
auto renderer_settings = _meta_system.renderer().settings();
auto bool_nk_wrapper = 0;
bool_nk_wrapper = renderer_settings.histogram_adjustment ? 1 : 0;
nk_checkbox_label(ctx, "Histogram Adjustment", &bool_nk_wrapper);
renderer_settings.histogram_adjustment = bool_nk_wrapper == 1;
bool_nk_wrapper = renderer_settings.histogram_trim ? 1 : 0;
nk_checkbox_label(ctx, "Histogram Trim", &bool_nk_wrapper);
renderer_settings.histogram_trim = bool_nk_wrapper == 1;
_meta_system.renderer().settings(renderer_settings, false);
}
nk_end(ctx);
......
......@@ -40,6 +40,7 @@ namespace mirrage::renderer {
int gi_low_quality_mip_levels = 0;
float exposure_override = 0.f;
bool histogram_adjustment = true;
bool histogram_trim = true;
bool ssao = true;
bool bloom = true;
......
......@@ -13,7 +13,8 @@ namespace mirrage::renderer {
constexpr auto histogram_buffer_length = histogram_slots + 1;
constexpr auto histogram_buffer_size = histogram_buffer_length * sizeof(float);
static_assert(sizeof(float) == sizeof(std::uint32_t));
constexpr auto workgroup_size = 16;
constexpr auto workgroup_size = 32;
constexpr auto histogram_batch_size = 16;
constexpr auto histogram_host_visible =
#ifdef HPC_HISTOGRAM_DEBUG_VIEW
true;
......@@ -105,12 +106,12 @@ namespace mirrage::renderer {
vk::SpecializationMapEntry{3, 3 * 32, 32},
vk::SpecializationMapEntry{4, 4 * 32, 32},
vk::SpecializationMapEntry{5, 5 * 32, 32}};
auto spec_data = std::array<char, 4 * 32>();
auto spec_data = std::array<char, 6 * 32>();
reinterpret_cast<std::int32_t&>(spec_data[0 * 32]) = histogram_slots;
reinterpret_cast<std::int32_t&>(spec_data[1 * 32]) = workgroup_size;
reinterpret_cast<float&>(spec_data[2 * 32]) = std::log(histogram_min);
reinterpret_cast<float&>(spec_data[3 * 32]) = std::log(histogram_max);
reinterpret_cast<float&>(spec_data[4 * 32]) = std::log(0.001f);
reinterpret_cast<float&>(spec_data[4 * 32]) = std::log(1.f / 255.f * 0.4f);
reinterpret_cast<float&>(spec_data[5 * 32]) = std::log(1.0f);
auto spec_info = vk::SpecializationInfo{
......@@ -297,7 +298,9 @@ namespace mirrage::renderer {
_extract_luminance(command_buffer);
_dispatch_build_histogram(command_buffer);
_dispatch_compute_exposure(command_buffer);
_dispatch_adjust_histogram(command_buffer);
if(_renderer.settings().histogram_trim) {
_dispatch_adjust_histogram(command_buffer);
}
_dispatch_build_final_factors(command_buffer);
#else // TODO
......@@ -405,9 +408,10 @@ namespace mirrage::renderer {
vk::PipelineBindPoint::eCompute, *_compute_pipeline_layout, 0, 1, &desc_set, 0, nullptr);
command_buffer.dispatch(
static_cast<std::uint32_t>(std::ceil(_luminance_buffer.width() / float(workgroup_size * 16))),
static_cast<std::uint32_t>(
std::ceil(_luminance_buffer.height() / float(workgroup_size * 16))),
std::ceil(_luminance_buffer.width() / float(workgroup_size * histogram_batch_size))),
static_cast<std::uint32_t>(
std::ceil(_luminance_buffer.height() / float(workgroup_size * histogram_batch_size))),
1);
}
void Tone_mapping_pass::_dispatch_compute_exposure(vk::CommandBuffer& command_buffer)
......@@ -489,7 +493,7 @@ namespace mirrage::renderer {
auto target_barrier = vk::ImageMemoryBarrier{
vk::AccessFlagBits::eShaderRead,
vk::AccessFlagBits::eShaderWrite,
vk::ImageLayout::eShaderReadOnlyOptimal,
vk::ImageLayout::eUndefined,
vk::ImageLayout::eGeneral,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
......@@ -550,6 +554,6 @@ namespace mirrage::renderer {
util::maybe<std::uint32_t>,
graphic::Device_create_info& create_info)
{
//create_info.features.shaderStorageImageExtendedFormats = true;
create_info.features.shaderStorageImageExtendedFormats = true;
}
} // namespace mirrage::renderer
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment