diff --git a/ofrak_gpu/.coveragerc b/ofrak_gpu/.coveragerc
index e3a1cb90d..2d5ec5083 100644
--- a/ofrak_gpu/.coveragerc
+++ b/ofrak_gpu/.coveragerc
@@ -1,5 +1,5 @@
 [run]
 omit =
     ofrak_gpu/entropy.py
-    ofrak_gpu/run.py
-    ofrak_gpu/bench.py
+    ofrak_gpu/run_entropy.py
+    ofrak_gpu/bench_entropy.py
diff --git a/ofrak_gpu/.gitignore b/ofrak_gpu/.gitignore
deleted file mode 100644
index 732dc4d3c..000000000
--- a/ofrak_gpu/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# This file will be automatically generated from entropy.fut on build, and is quite unrult for VCS
-ofrak_gpu/entropy.py
diff --git a/ofrak_gpu/README.md b/ofrak_gpu/README.md
index 56a8e9af7..801945f2a 100644
--- a/ofrak_gpu/README.md
+++ b/ofrak_gpu/README.md
@@ -1 +1,25 @@
-# ofrak_gpu
+# OFRAK
+OFRAK (Open Firmware Reverse Analysis Konsole) is a binary analysis and modification platform that combines the ability to unpack, analyze, modify, and repack binaries.
+
+
+# Package: ofrak_gpu
+
+```
+OFRAK
+└───...
+└───ofrak_gpu <-- //YOU ARE HERE//
+│ └───entropy.fut // Futhark source code for GPU-bound entropy calculations
+│ └───entropy.py // Pyopencl equivalent of entropy.fut, generated by the Futhark compiler (see "Futhark Compilation")
+│ └───run_entropy.py // CLI to compute the entropy of a file
+└───...
+```
+
+This package contains the GPU-bound implementations of expensive computations performed by OFRAK.
+
+## Futhark Compilation
+The [Futhark language](https://futhark-lang.org/) is not required to install and run this module, nor will the Futhark compiler be installed into your Docker container. Instead, `entropy.py`, the compiled pyopencl output of `entropy.fut`, is provided. If you would like to make any changes to `entropy.fut`, you will have to re-generate `entropy.py` with `futhark pyopencl --library entropy.fut -o entropy.py` - see the `make futhark` target. The `--library` flag must be used.
+## Dependencies
+This package requires numpy, pyopencl, and an OpenCL platform to run. If you don't have an OpenCL platform and want to test your code, consider [oclgrind, pocl, or another pip-installable CPU runtime for OpenCL](https://documen.tician.de/pyopencl/misc.html#enabling-access-to-cpus-and-gpus-via-py-opencl). oclgrind is installed by requirements-test by default so that the tests can run. The Futhark compiler is not required; see "Futhark Compilation."
+
+## Testing
+This package maintains 100% test coverage of functions. See `ofrak_gpu_test`.
diff --git a/ofrak_gpu/max_entropy_256_B_windows.bin b/ofrak_gpu/max_entropy_256_B_windows.bin
deleted file mode 100644
index eb5d461ee..000000000
Binary files a/ofrak_gpu/max_entropy_256_B_windows.bin and /dev/null differ
diff --git a/ofrak_gpu/ofrak_gpu/entropy.py b/ofrak_gpu/ofrak_gpu/entropy.py
new file mode 100644
index 000000000..e79d169e7
--- /dev/null
+++ b/ofrak_gpu/ofrak_gpu/entropy.py
@@ -0,0 +1,12306 @@
+# Generated by Futhark 0.25.17.
+# Compiled with GHC 9.8.2.
+import sys
+import numpy as np
+import ctypes as ct
+
+# Stub code for OpenCL setup.
+
+import pyopencl as cl
+import numpy as np
+import sys
+
+if cl.version.VERSION < (2015, 2):
+    raise Exception(
+        "Futhark requires at least PyOpenCL version 2015.2. Installed version is %s."
+        % cl.version.VERSION_TEXT
+    )
+
+TR_BLOCK_DIM = 16
+TR_TILE_DIM = TR_BLOCK_DIM * 2
+TR_ELEMS_PER_THREAD = 8
+
+
+def parse_preferred_device(s):
+    pref_num = 0
+    if len(s) > 1 and s[0] == "#":
+        i = 1
+        while i < len(s):
+            if not s[i].isdigit():
+                break
+            else:
+                pref_num = pref_num * 10 + int(s[i])
+            i += 1
+        while i < len(s) and s[i].isspace():
+            i += 1
+        return (s[i:], pref_num)
+    else:
+        return (s, 0)
+
+
+def get_prefered_context(interactive=False, platform_pref=None, device_pref=None):
+    if device_pref != None:
+        (device_pref, device_num) = parse_preferred_device(device_pref)
+    else:
+        device_num = 0
+
+    if interactive:
+        return cl.create_some_context(interactive=True)
+
+    def blacklisted(p, d):
+        return (
+            platform_pref == None
+            and device_pref == None
+            and p.name == "Apple"
+            and d.name.find("Intel(R) Core(TM)") >= 0
+        )
+
+    def platform_ok(p):
+        return not platform_pref or p.name.find(platform_pref) >= 0
+
+    def device_ok(d):
+        return not device_pref or d.name.find(device_pref) >= 0
+
+    device_matches = 0
+
+    for p in cl.get_platforms():
+        if not platform_ok(p):
+            continue
+        for d in p.get_devices():
+            if blacklisted(p, d) or not device_ok(d):
+                continue
+            if device_matches == device_num:
+                return cl.Context(devices=[d])
+            else:
+                device_matches += 1
+    raise Exception("No OpenCL platform and device matching constraints found.")
+
+
+def param_assignment(s):
+    name, value = s.split("=")
+    return (name, int(value))
+
+
+def check_types(self, required_types):
+    if "f64" in required_types:
+        if self.device.get_info(cl.device_info.PREFERRED_VECTOR_WIDTH_DOUBLE) == 0:
+            raise Exception(
+                "Program uses double-precision floats, but this is not supported on chosen device: %s"
+                % self.device.name
+            )
+
+
+def apply_size_heuristics(self, size_heuristics, sizes):
+    for platform_name, device_type, size, valuef in size_heuristics:
+        if (
+            sizes[size] == None
+            and self.platform.name.find(platform_name) >= 0
+            and (self.device.type & device_type) == device_type
+        ):
+            sizes[size] = valuef(self.device)
+    return sizes
+
+
+def to_c_str_rep(x):
+    if type(x) is bool or type(x) is np.bool_:
+        if x:
+            return "true"
+        else:
+            return "false"
+    else:
+        return str(x)
+
+
+def initialise_opencl_object(
+    self,
+    program_src="",
+    build_options=[],
+    command_queue=None,
+    interactive=False,
+    platform_pref=None,
+    device_pref=None,
+    default_group_size=None,
+    default_num_groups=None,
+    default_tile_size=None,
+    default_reg_tile_size=None,
+    default_threshold=None,
+    size_heuristics=[],
+    required_types=[],
+    all_sizes={},
+    user_sizes={},
+    constants=[],
+):
+    if command_queue is None:
+        self.ctx = get_prefered_context(interactive, platform_pref, device_pref)
+        self.queue = cl.CommandQueue(self.ctx)
+    else:
+        self.ctx = command_queue.context
+        self.queue = command_queue
+    self.device = self.queue.device
+    self.platform = self.device.platform
+    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))
+    device_type = self.device.type
+
+    check_types(self, required_types)
+
+    max_group_size = int(self.device.max_work_group_size)
+    max_tile_size = int(np.sqrt(self.device.max_work_group_size))
+
+    self.max_thread_block_size = max_group_size
+    self.max_tile_size = max_tile_size
+    self.max_threshold = 0
+    self.max_grid_size = 0
+
+    self.max_shared_memory = int(self.device.local_mem_size)
+
+    # Futhark reserves 4 bytes of local memory for its own purposes.
+    self.max_shared_memory -= 4
+
+    # See comment in rts/c/opencl.h.
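+    # Certain platforms additionally reserve a few bytes of local memory per
+    # work-group for their own bookkeeping; subtract them up front so kernels
+    # never request more shared memory than is actually available.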
+    if self.platform.name.find("NVIDIA CUDA") >= 0:
+        self.max_shared_memory -= 12
+    elif self.platform.name.find("AMD") >= 0:
+        self.max_shared_memory -= 16
+
+    self.max_registers = int(2**16)  # Not sure how to query for this.
+
+    self.max_cache = self.device.get_info(cl.device_info.GLOBAL_MEM_CACHE_SIZE)
+
+    if self.max_cache == 0:
+        self.max_cache = 1024 * 1024
+
+    self.free_list = {}
+
+    self.global_failure = self.pool.allocate(np.int32().itemsize)
+    cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1), 0, np.int32().itemsize)
+    self.global_failure_args = self.pool.allocate(
+        np.int64().itemsize * (self.global_failure_args_max + 1)
+    )
+    self.failure_is_an_option = np.int32(0)
+
+    if "default_group_size" in sizes:
+        default_group_size = sizes["default_group_size"]
+        del sizes["default_group_size"]
+
+    if "default_num_groups" in sizes:
+        default_num_groups = sizes["default_num_groups"]
+        del sizes["default_num_groups"]
+
+    if "default_tile_size" in sizes:
+        default_tile_size = sizes["default_tile_size"]
+        del sizes["default_tile_size"]
+
+    if "default_reg_tile_size" in sizes:
+        default_reg_tile_size = sizes["default_reg_tile_size"]
+        del sizes["default_reg_tile_size"]
+
+    if "default_threshold" in sizes:
+        default_threshold = sizes["default_threshold"]
+        del sizes["default_threshold"]
+
+    default_group_size_set = default_group_size != None
+    default_tile_size_set = default_tile_size != None
+    default_sizes = apply_size_heuristics(
+        self,
+        size_heuristics,
+        {
+            "group_size": default_group_size,
+            "tile_size": default_tile_size,
+            "reg_tile_size": default_reg_tile_size,
+            "num_groups": default_num_groups,
+            "lockstep_width": None,
+            "threshold": default_threshold,
+        },
+    )
+    default_group_size = default_sizes["group_size"]
+    default_num_groups = default_sizes["num_groups"]
+    default_threshold = default_sizes["threshold"]
+    default_tile_size = default_sizes["tile_size"]
+    default_reg_tile_size = default_sizes["reg_tile_size"]
+    lockstep_width = default_sizes["lockstep_width"]
+
+    if default_group_size > max_group_size:
+        if default_group_size_set:
+            sys.stderr.write(
+                "Note: Device limits group size to {} (down from {})\n".format(
+                    max_group_size, default_group_size
+                )
+            )
+        default_group_size = max_group_size
+
+    if default_tile_size > max_tile_size:
+        if default_tile_size_set:
+            sys.stderr.write(
+                "Note: Device limits tile size to {} (down from {})\n".format(
+                    max_tile_size, default_tile_size
+                )
+            )
+        default_tile_size = max_tile_size
+
+    for k, v in user_sizes.items():
+        if k in all_sizes:
+            all_sizes[k]["value"] = v
+        else:
+            raise Exception(
+                "Unknown size: {}\nKnown sizes: {}".format(k, " ".join(all_sizes.keys()))
+            )
+
+    self.sizes = {}
+    for k, v in all_sizes.items():
+        if v["class"] == "thread_block_size":
+            max_value = max_group_size
+            default_value = default_group_size
+        elif v["class"] == "grid_size":
+            max_value = max_group_size  # Intentional!
+            default_value = default_num_groups
+        elif v["class"] == "tile_size":
+            max_value = max_tile_size
+            default_value = default_tile_size
+        elif v["class"] == "reg_tile_size":
+            max_value = None
+            default_value = default_reg_tile_size
+        elif v["class"].startswith("threshold"):
+            max_value = None
+            default_value = default_threshold
+        else:
+            # Bespoke sizes have no limit or default.
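+            # Whatever value was configured for them is used below without clamping.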
+            max_value = None
+        if v["value"] == None:
+            self.sizes[k] = default_value
+        elif max_value != None and v["value"] > max_value:
+            sys.stderr.write(
+                "Note: Device limits {} to {} (down from {})\n".format(k, max_value, v["value"])
+            )
+            self.sizes[k] = max_value
+        else:
+            self.sizes[k] = v["value"]
+
+    # XXX: we perform only a subset of z-encoding here. Really, the
+    # compiler should provide us with the variables to which
+    # parameters are mapped.
+    if len(program_src) >= 0:
+        build_options += [f"-DLOCKSTEP_WIDTH={lockstep_width}"]
+
+        build_options += ["-D{}={}".format("max_thread_block_size", max_group_size)]
+
+        build_options += [
+            "-D{}={}".format(
+                s.replace("z", "zz").replace(".", "zi").replace("#", "zh").replace("'", "zq"),
+                v,
+            )
+            for (s, v) in self.sizes.items()
+        ]
+
+        build_options += [f"-D{s}={to_c_str_rep(f())}" for (s, f) in constants]
+
+        if self.platform.name == "Oclgrind":
+            build_options += ["-DEMULATE_F16"]
+
+        build_options += [
+            f"-DTR_BLOCK_DIM={TR_BLOCK_DIM}",
+            f"-DTR_TILE_DIM={TR_TILE_DIM}",
+            f"-DTR_ELEMS_PER_THREAD={TR_ELEMS_PER_THREAD}",
+        ]
+
+        program = cl.Program(self.ctx, program_src).build(build_options)
+
+        self.transpose_kernels = {
+            1: {
+                "default": program.map_transpose_1b,
+                "low_height": program.map_transpose_1b_low_height,
+                "low_width": program.map_transpose_1b_low_width,
+                "small": program.map_transpose_1b_small,
+                "large": program.map_transpose_1b_large,
+            },
+            2: {
+                "default": program.map_transpose_2b,
+                "low_height": program.map_transpose_2b_low_height,
+                "low_width": program.map_transpose_2b_low_width,
+                "small": program.map_transpose_2b_small,
+                "large": program.map_transpose_2b_large,
+            },
+            4: {
+                "default": program.map_transpose_4b,
+                "low_height": program.map_transpose_4b_low_height,
+                "low_width": program.map_transpose_4b_low_width,
+                "small": program.map_transpose_4b_small,
+                "large": program.map_transpose_4b_large,
+            },
+            8: {
+                "default": program.map_transpose_8b,
+                "low_height": program.map_transpose_8b_low_height,
+                "low_width": program.map_transpose_8b_low_width,
+                "small": program.map_transpose_8b_small,
+                "large": program.map_transpose_8b_large,
+            },
+        }
+
+        self.copy_kernels = {
+            1: program.lmad_copy_1b,
+            2: program.lmad_copy_2b,
+            4: program.lmad_copy_4b,
+            8: program.lmad_copy_8b,
+        }
+
+        return program
+
+
+def opencl_alloc(self, min_size, tag):
+    min_size = 1 if min_size == 0 else min_size
+    assert min_size > 0
+    return self.pool.allocate(min_size)
+
+
+def opencl_free_all(self):
+    self.pool.free_held()
+
+
+def sync(self):
+    failure = np.empty(1, dtype=np.int32)
+    cl.enqueue_copy(self.queue, failure, self.global_failure, is_blocking=True)
+    self.failure_is_an_option = np.int32(0)
+    if failure[0] >= 0:
+        # Reset failure information.
+        cl.enqueue_fill_buffer(
+            self.queue,
+            self.global_failure,
+            np.int32(-1),
+            0,
+            np.int32().itemsize,
+        )
+
+        # Read failure args.
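+        # The failing kernel stored the arguments for its error message next to
+        # the failure code; fetch them so the exception below can be formatted.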
+        failure_args = np.empty(self.global_failure_args_max + 1, dtype=np.int64)
+        cl.enqueue_copy(
+            self.queue,
+            failure_args,
+            self.global_failure_args,
+            is_blocking=True,
+        )
+
+        raise Exception(self.failure_msgs[failure[0]].format(*failure_args))
+
+
+def map_transpose_gpu2gpu(self, elem_size, dst, dst_offset, src, src_offset, k, n, m):
+    kernels = self.transpose_kernels[elem_size]
+    kernel = kernels["default"]
+    mulx = TR_BLOCK_DIM / n
+    muly = TR_BLOCK_DIM / m
+
+    group_dims = (TR_TILE_DIM, TR_TILE_DIM // TR_ELEMS_PER_THREAD, 1)
+    dims = (
+        (m + TR_TILE_DIM - 1) // TR_TILE_DIM * group_dims[0],
+        (n + TR_TILE_DIM - 1) // TR_TILE_DIM * group_dims[1],
+        k,
+    )
+
+    k32 = np.int32(k)
+    n32 = np.int32(n)
+    m32 = np.int32(m)
+    mulx32 = np.int32(mulx)
+    muly32 = np.int32(muly)
+
+    kernel.set_args(
+        cl.LocalMemory(TR_TILE_DIM * (TR_TILE_DIM + 1) * elem_size),
+        dst,
+        dst_offset,
+        src,
+        src_offset,
+        k32,
+        m32,
+        n32,
+        mulx32,
+        muly32,
+        np.int32(0),
+        np.int32(0),
+    )
+    cl.enqueue_nd_range_kernel(self.queue, kernel, dims, group_dims)
+
+
+def copy_elements_gpu2gpu(
+    self,
+    elem_size,
+    dst,
+    dst_offset,
+    dst_strides,
+    src,
+    src_offset,
+    src_strides,
+    shape,
+):
+    r = len(shape)
+    if r > 8:
+        raise Exception("Futhark runtime limitation:\nCannot copy array of greater than rank 8.\n")
+
+    n = np.prod(shape)
+    zero = np.int64(0)
+    layout_args = [None] * (8 * 3)
+    for i in range(8):
+        if i < r:
+            layout_args[i * 3 + 0] = shape[i]
+            layout_args[i * 3 + 1] = dst_strides[i]
+            layout_args[i * 3 + 2] = src_strides[i]
+        else:
+            layout_args[i * 3 + 0] = zero
+            layout_args[i * 3 + 1] = zero
+            layout_args[i * 3 + 2] = zero
+
+    kernel = self.copy_kernels[elem_size]
+    kernel.set_args(
+        cl.LocalMemory(1),
+        dst,
+        dst_offset,
+        src,
+        src_offset,
+        n,
+        np.int32(r),
+        *layout_args,
+    )
+    w = 256
+    dims = ((n + w - 1) // w * w,)
+    group_dims = (w,)
+    cl.enqueue_nd_range_kernel(self.queue, kernel, dims, group_dims)
+
+
+def lmad_copy_gpu2gpu(self, pt, dst, dst_offset, dst_strides, src, src_offset, src_strides, shape):
+    elem_size = ct.sizeof(pt)
+    nbytes = np.prod(shape) * elem_size
+    if nbytes == 0:
+        return None
+    if lmad_memcpyable(dst_strides, src_strides, shape):
+        cl.enqueue_copy(
+            self.queue,
+            dst,
+            src,
+            dst_offset=dst_offset * elem_size,
+            src_offset=src_offset * elem_size,
+            byte_count=nbytes,
+        )
+    else:
+        tr = lmad_map_tr(dst_strides, src_strides, shape)
+        if tr is not None:
+            (k, n, m) = tr
+            map_transpose_gpu2gpu(self, elem_size, dst, dst_offset, src, src_offset, k, m, n)
+        else:
+            copy_elements_gpu2gpu(
+                self,
+                elem_size,
+                dst,
+                dst_offset,
+                dst_strides,
+                src,
+                src_offset,
+                src_strides,
+                shape,
+            )
+
+
+import pyopencl.array
+import time
+
+sizes = {}
+synchronous = False
+preferred_platform = None
+build_options = []
+preferred_device = None
+default_threshold = None
+default_group_size = None
+default_num_groups = None
+default_tile_size = None
+default_reg_tile_size = None
+fut_opencl_src = """#define FUTHARK_OPENCL
+// Start of prelude.cl
+
+#define SCALAR_FUN_ATTR static inline
+#define FUTHARK_FUN_ATTR static
+
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+typedef long int64_t;
+
+typedef uchar uint8_t;
+typedef ushort uint16_t;
+typedef uint uint32_t;
+typedef ulong uint64_t;
+
+#define get_tblock_id(d) get_group_id(d)
+#define get_num_tblocks(d) get_num_groups(d)
+
+// Clang-based OpenCL implementations need this for 'static' to work.
+#ifdef cl_clang_storage_class_specifiers
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+#ifdef FUTHARK_F64_ENABLED
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+// NVIDIA's OpenCL does not create device-wide memory fences (see #734), so we
+// use inline assembly if we detect we are on an NVIDIA GPU.
+#ifdef cl_nv_pragma_unroll
+static inline void mem_fence_global() {
+  asm("membar.gl;");
+}
+#else
+static inline void mem_fence_global() {
+  mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
+#endif
+static inline void mem_fence_local() {
+  mem_fence(CLK_LOCAL_MEM_FENCE);
+}
+
+static inline void barrier_local() {
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// Important for this to be int64_t so it has proper alignment for any type.
+#define SHARED_MEM_PARAM __local uint64_t* shared_mem,
+#define FUTHARK_KERNEL __kernel
+#define FUTHARK_KERNEL_SIZED(a,b,c) __attribute__((reqd_work_group_size(a, b, c))) __kernel
+
+// End of prelude.cl
+// Start of half.h.
+
+// Conversion functions are from http://half.sourceforge.net/, but
+// translated to C.
+//
+// Copyright (c) 2012-2021 Christian Rau
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
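+
+// The lookup tables below implement branch-free float<->half conversion:
+// base_table and shift_table map a binary32 bit pattern down to binary16,
+// while mantissa_table, exponent_table and offset_table handle the reverse.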
+ +#ifndef __OPENCL_VERSION__ +#define __constant +#endif + +__constant static const uint16_t base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, + 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, + 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, + 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, + 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 }; + +__constant static const unsigned char shift_table[512] = { + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 }; + +__constant static const uint32_t mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, + 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, + 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 
0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, + 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, + 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, + 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, + 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, + 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, + 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, + 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, + 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, + 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, + 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 
0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, + 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, + 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, + 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, + 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, + 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, + 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, + 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, + 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, + 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, + 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, + 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 
0x381F4000, 0x381F8000, 0x381FC000, + 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, + 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, + 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, + 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, + 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, + 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, + 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, + 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, + 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, + 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, + 0x38680000, 0x38684000, 
0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, + 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, + 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, + 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, + 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, + 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, + 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, + 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, + 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, + 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, + 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, + 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 
0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, + 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, + 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, + 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, + 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, + 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, + 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, + 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, + 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, + 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, + 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 
0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, + 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, + 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, + 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, + 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, + 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, + 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, + 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, + 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, + 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, + 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, + 0x38620000, 
0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, + 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, + 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, + 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, + 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, + 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, + 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, + 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, + 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 }; +__constant static const uint32_t exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, + 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, + 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 
0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 }; +__constant static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; + +SCALAR_FUN_ATTR uint16_t float2halfbits(float value) { + union { float x; uint32_t y; } u; + u.x = value; + uint32_t bits = u.y; + + uint16_t hbits = base_table[bits>>23] + (uint16_t)((bits&0x7FFFFF)>>shift_table[bits>>23]);; + + return hbits; +} + +SCALAR_FUN_ATTR float halfbits2float(uint16_t value) { + uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10]; + + union { uint32_t x; float y; } u; + u.x = bits; + return u.y; +} + +SCALAR_FUN_ATTR uint16_t halfbitsnextafter(uint16_t from, uint16_t to) { + int fabs = from & 0x7FFF, tabs = to & 0x7FFF; + if(fabs > 0x7C00 || tabs > 0x7C00) { + return ((from&0x7FFF)>0x7C00) ? (from|0x200) : (to|0x200); + } + if(from == to || !(fabs|tabs)) { + return to; + } + if(!fabs) { + return (to&0x8000)+1; + } + unsigned int out = + from + + (((from>>15)^(unsigned int)((from^(0x8000|(0x8000-(from>>15))))<(to^(0x8000|(0x8000-(to>>15))))))<<1) + - 1; + return out; +} + +// End of half.h. +// Start of scalar.h. + +// Implementation of the primitive scalar operations. Very +// repetitive. This code is inserted directly into both CUDA and +// OpenCL programs, as well as the CPU code, so it has some #ifdefs to +// work everywhere. Some operations are defined as macros because +// this allows us to use them as constant expressions in things like +// array sizes and static initialisers. + +// Some of the #ifdefs are because OpenCL uses type-generic functions +// for some operations (e.g. sqrt), while C and CUDA sensibly use +// distinct functions for different precisions (e.g. sqrtf() and +// sqrt()). This is quite annoying. Due to C's unfortunate casting +// rules, it is also really easy to accidentally implement +// floating-point functions in the wrong precision, so be careful. + +// Double-precision definitions are only included if the preprocessor +// macro FUTHARK_F64_ENABLED is set. 
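+//
+// Naming convention for the integer helpers that follow: a 'u' prefix is the
+// unsigned operation, 's' the signed one, with the bit width as the suffix.
+// The *_safe variants return 0 rather than trapping when the divisor is 0.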
+ +SCALAR_FUN_ATTR int32_t futrts_to_bits32(float x); +SCALAR_FUN_ATTR float futrts_from_bits32(int32_t x); + +SCALAR_FUN_ATTR uint8_t add8(uint8_t x, uint8_t y) { + return x + y; +} + +SCALAR_FUN_ATTR uint16_t add16(uint16_t x, uint16_t y) { + return x + y; +} + +SCALAR_FUN_ATTR uint32_t add32(uint32_t x, uint32_t y) { + return x + y; +} + +SCALAR_FUN_ATTR uint64_t add64(uint64_t x, uint64_t y) { + return x + y; +} + +SCALAR_FUN_ATTR uint8_t sub8(uint8_t x, uint8_t y) { + return x - y; +} + +SCALAR_FUN_ATTR uint16_t sub16(uint16_t x, uint16_t y) { + return x - y; +} + +SCALAR_FUN_ATTR uint32_t sub32(uint32_t x, uint32_t y) { + return x - y; +} + +SCALAR_FUN_ATTR uint64_t sub64(uint64_t x, uint64_t y) { + return x - y; +} + +SCALAR_FUN_ATTR uint8_t mul8(uint8_t x, uint8_t y) { + return x * y; +} + +SCALAR_FUN_ATTR uint16_t mul16(uint16_t x, uint16_t y) { + return x * y; +} + +SCALAR_FUN_ATTR uint32_t mul32(uint32_t x, uint32_t y) { + return x * y; +} + +SCALAR_FUN_ATTR uint64_t mul64(uint64_t x, uint64_t y) { + return x * y; +} + +#if ISPC + +SCALAR_FUN_ATTR uint8_t udiv8(uint8_t x, uint8_t y) { + // This strange pattern is used to prevent the ISPC compiler from + // causing SIGFPEs and bogus results on divisions where inactive lanes + // have 0-valued divisors. It ensures that any inactive lane instead + // has a divisor of 1. https://github.com/ispc/ispc/issues/2292 + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x / ys; +} + +SCALAR_FUN_ATTR uint16_t udiv16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x / ys; +} + +SCALAR_FUN_ATTR uint32_t udiv32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return x / ys; +} + +SCALAR_FUN_ATTR uint64_t udiv64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return x / ys; +} + +SCALAR_FUN_ATTR uint8_t udiv_up8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint16_t udiv_up16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint32_t udiv_up32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint64_t udiv_up64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint8_t umod8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x % ys; +} + +SCALAR_FUN_ATTR uint16_t umod16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return x % ys; +} + +SCALAR_FUN_ATTR uint32_t umod32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x % ys; +} + +SCALAR_FUN_ATTR uint64_t umod64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x % ys; +} + +SCALAR_FUN_ATTR uint8_t udiv_safe8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x / ys; +} + +SCALAR_FUN_ATTR uint16_t udiv_safe16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x / ys; +} + +SCALAR_FUN_ATTR uint32_t udiv_safe32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 
0 : x / ys; +} + +SCALAR_FUN_ATTR uint64_t udiv_safe64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x / ys; +} + +SCALAR_FUN_ATTR uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint8_t umod_safe8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +SCALAR_FUN_ATTR uint16_t umod_safe16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +SCALAR_FUN_ATTR uint32_t umod_safe32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +SCALAR_FUN_ATTR uint64_t umod_safe64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +SCALAR_FUN_ATTR int8_t sdiv8(int8_t x, int8_t y) { + int8_t ys = 1; + foreach_active(i){ + ys = y; + } + + int8_t q = x / ys; + int8_t r = x % ys; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int16_t sdiv16(int16_t x, int16_t y) { + int16_t ys = 1; + foreach_active(i){ + ys = y; + } + + int16_t q = x / ys; + int16_t r = x % ys; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int32_t sdiv32(int32_t x, int32_t y) { + int32_t ys = 1; + foreach_active(i){ + ys = y; + } + int32_t q = x / ys; + int32_t r = x % ys; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int64_t sdiv64(int64_t x, int64_t y) { + int64_t ys = 1; + foreach_active(i){ + ys = y; + } + + int64_t q = x / ys; + int64_t r = x % ys; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int8_t sdiv_up8(int8_t x, int8_t y) { + return sdiv8(x + y - 1, y); +} + +SCALAR_FUN_ATTR int16_t sdiv_up16(int16_t x, int16_t y) { + return sdiv16(x + y - 1, y); +} + +SCALAR_FUN_ATTR int32_t sdiv_up32(int32_t x, int32_t y) { + return sdiv32(x + y - 1, y); +} + +SCALAR_FUN_ATTR int64_t sdiv_up64(int64_t x, int64_t y) { + return sdiv64(x + y - 1, y); +} + +SCALAR_FUN_ATTR int8_t smod8(int8_t x, int8_t y) { + int8_t ys = 1; + foreach_active(i){ + ys = y; + } + + int8_t r = x % ys; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int16_t smod16(int16_t x, int16_t y) { + int16_t ys = 1; + foreach_active(i){ + ys = y; + } + + int16_t r = x % ys; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int32_t smod32(int32_t x, int32_t y) { + int32_t ys = 1; + foreach_active(i){ + ys = y; + } + + int32_t r = x % ys; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int64_t smod64(int64_t x, int64_t y) { + int64_t ys = 1; + foreach_active(i){ + ys = y; + } + + int64_t r = x % ys; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 
0 : y);
+}
+
+SCALAR_FUN_ATTR int8_t sdiv_safe8(int8_t x, int8_t y) {
+  return y == 0 ? 0 : sdiv8(x, y);
+}
+
+SCALAR_FUN_ATTR int16_t sdiv_safe16(int16_t x, int16_t y) {
+  return y == 0 ? 0 : sdiv16(x, y);
+}
+
+SCALAR_FUN_ATTR int32_t sdiv_safe32(int32_t x, int32_t y) {
+  return y == 0 ? 0 : sdiv32(x, y);
+}
+
+SCALAR_FUN_ATTR int64_t sdiv_safe64(int64_t x, int64_t y) {
+  return y == 0 ? 0 : sdiv64(x, y);
+}
+
+SCALAR_FUN_ATTR int8_t sdiv_up_safe8(int8_t x, int8_t y) {
+  return sdiv_safe8(x + y - 1, y);
+}
+
+SCALAR_FUN_ATTR int16_t sdiv_up_safe16(int16_t x, int16_t y) {
+  return sdiv_safe16(x + y - 1, y);
+}
+
+SCALAR_FUN_ATTR int32_t sdiv_up_safe32(int32_t x, int32_t y) {
+  return sdiv_safe32(x + y - 1, y);
+}
+
+SCALAR_FUN_ATTR int64_t sdiv_up_safe64(int64_t x, int64_t y) {
+  return sdiv_safe64(x + y - 1, y);
+}
+
+SCALAR_FUN_ATTR int8_t smod_safe8(int8_t x, int8_t y) {
+  return y == 0 ? 0 : smod8(x, y);
+}
+
+SCALAR_FUN_ATTR int16_t smod_safe16(int16_t x, int16_t y) {
+  return y == 0 ? 0 : smod16(x, y);
+}
+
+SCALAR_FUN_ATTR int32_t smod_safe32(int32_t x, int32_t y) {
+  return y == 0 ? 0 : smod32(x, y);
+}
+
+SCALAR_FUN_ATTR int64_t smod_safe64(int64_t x, int64_t y) {
+  return y == 0 ? 0 : smod64(x, y);
+}
+
+SCALAR_FUN_ATTR int8_t squot8(int8_t x, int8_t y) {
+  int8_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x / ys;
+}
+
+SCALAR_FUN_ATTR int16_t squot16(int16_t x, int16_t y) {
+  int16_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x / ys;
+}
+
+SCALAR_FUN_ATTR int32_t squot32(int32_t x, int32_t y) {
+  int32_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x / ys;
+}
+
+SCALAR_FUN_ATTR int64_t squot64(int64_t x, int64_t y) {
+  int64_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x / ys;
+}
+
+SCALAR_FUN_ATTR int8_t srem8(int8_t x, int8_t y) {
+  int8_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x % ys;
+}
+
+SCALAR_FUN_ATTR int16_t srem16(int16_t x, int16_t y) {
+  int16_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x % ys;
+}
+
+SCALAR_FUN_ATTR int32_t srem32(int32_t x, int32_t y) {
+  int32_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x % ys;
+}
+
+SCALAR_FUN_ATTR int64_t srem64(int64_t x, int64_t y) {
+  int64_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x % ys;
+}
+
+SCALAR_FUN_ATTR int8_t squot_safe8(int8_t x, int8_t y) {
+  int8_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x / ys;
+}
+
+SCALAR_FUN_ATTR int16_t squot_safe16(int16_t x, int16_t y) {
+  int16_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x / ys;
+}
+
+SCALAR_FUN_ATTR int32_t squot_safe32(int32_t x, int32_t y) {
+  int32_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x / ys;
+}
+
+SCALAR_FUN_ATTR int64_t squot_safe64(int64_t x, int64_t y) {
+  int64_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x / ys;
+}
+
+SCALAR_FUN_ATTR int8_t srem_safe8(int8_t x, int8_t y) {
+  int8_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x % ys;
+}
+
+SCALAR_FUN_ATTR int16_t srem_safe16(int16_t x, int16_t y) {
+  int16_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x % ys;
+}
+
+SCALAR_FUN_ATTR int32_t srem_safe32(int32_t x, int32_t y) {
+  int32_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x % ys;
+}
+
+SCALAR_FUN_ATTR int64_t srem_safe64(int64_t x, int64_t y) {
+  int64_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 
0 : x % ys; +} + +#else + +SCALAR_FUN_ATTR uint8_t udiv8(uint8_t x, uint8_t y) { + return x / y; +} + +SCALAR_FUN_ATTR uint16_t udiv16(uint16_t x, uint16_t y) { + return x / y; +} + +SCALAR_FUN_ATTR uint32_t udiv32(uint32_t x, uint32_t y) { + return x / y; +} + +SCALAR_FUN_ATTR uint64_t udiv64(uint64_t x, uint64_t y) { + return x / y; +} + +SCALAR_FUN_ATTR uint8_t udiv_up8(uint8_t x, uint8_t y) { + return (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint16_t udiv_up16(uint16_t x, uint16_t y) { + return (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint32_t udiv_up32(uint32_t x, uint32_t y) { + return (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint64_t udiv_up64(uint64_t x, uint64_t y) { + return (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint8_t umod8(uint8_t x, uint8_t y) { + return x % y; +} + +SCALAR_FUN_ATTR uint16_t umod16(uint16_t x, uint16_t y) { + return x % y; +} + +SCALAR_FUN_ATTR uint32_t umod32(uint32_t x, uint32_t y) { + return x % y; +} + +SCALAR_FUN_ATTR uint64_t umod64(uint64_t x, uint64_t y) { + return x % y; +} + +SCALAR_FUN_ATTR uint8_t udiv_safe8(uint8_t x, uint8_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR uint16_t udiv_safe16(uint16_t x, uint16_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR uint32_t udiv_safe32(uint32_t x, uint32_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR uint64_t udiv_safe64(uint64_t x, uint64_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { + return y == 0 ? 0 : (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { + return y == 0 ? 0 : (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { + return y == 0 ? 0 : (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { + return y == 0 ? 0 : (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint8_t umod_safe8(uint8_t x, uint8_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR uint16_t umod_safe16(uint16_t x, uint16_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR uint32_t umod_safe32(uint32_t x, uint32_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR uint64_t umod_safe64(uint64_t x, uint64_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR int8_t sdiv8(int8_t x, int8_t y) { + int8_t q = x / y; + int8_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int16_t sdiv16(int16_t x, int16_t y) { + int16_t q = x / y; + int16_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int32_t sdiv32(int32_t x, int32_t y) { + int32_t q = x / y; + int32_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int64_t sdiv64(int64_t x, int64_t y) { + int64_t q = x / y; + int64_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int8_t sdiv_up8(int8_t x, int8_t y) { + return sdiv8(x + y - 1, y); +} + +SCALAR_FUN_ATTR int16_t sdiv_up16(int16_t x, int16_t y) { + return sdiv16(x + y - 1, y); +} + +SCALAR_FUN_ATTR int32_t sdiv_up32(int32_t x, int32_t y) { + return sdiv32(x + y - 1, y); +} + +SCALAR_FUN_ATTR int64_t sdiv_up64(int64_t x, int64_t y) { + return sdiv64(x + y - 1, y); +} + +SCALAR_FUN_ATTR int8_t smod8(int8_t x, int8_t y) { + int8_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int16_t smod16(int16_t x, int16_t y) { + int16_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 
0 : y); +} + +SCALAR_FUN_ATTR int32_t smod32(int32_t x, int32_t y) { + int32_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int64_t smod64(int64_t x, int64_t y) { + int64_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int8_t sdiv_safe8(int8_t x, int8_t y) { + return y == 0 ? 0 : sdiv8(x, y); +} + +SCALAR_FUN_ATTR int16_t sdiv_safe16(int16_t x, int16_t y) { + return y == 0 ? 0 : sdiv16(x, y); +} + +SCALAR_FUN_ATTR int32_t sdiv_safe32(int32_t x, int32_t y) { + return y == 0 ? 0 : sdiv32(x, y); +} + +SCALAR_FUN_ATTR int64_t sdiv_safe64(int64_t x, int64_t y) { + return y == 0 ? 0 : sdiv64(x, y); +} + +SCALAR_FUN_ATTR int8_t sdiv_up_safe8(int8_t x, int8_t y) { + return sdiv_safe8(x + y - 1, y); +} + +SCALAR_FUN_ATTR int16_t sdiv_up_safe16(int16_t x, int16_t y) { + return sdiv_safe16(x + y - 1, y); +} + +SCALAR_FUN_ATTR int32_t sdiv_up_safe32(int32_t x, int32_t y) { + return sdiv_safe32(x + y - 1, y); +} + +SCALAR_FUN_ATTR int64_t sdiv_up_safe64(int64_t x, int64_t y) { + return sdiv_safe64(x + y - 1, y); +} + +SCALAR_FUN_ATTR int8_t smod_safe8(int8_t x, int8_t y) { + return y == 0 ? 0 : smod8(x, y); +} + +SCALAR_FUN_ATTR int16_t smod_safe16(int16_t x, int16_t y) { + return y == 0 ? 0 : smod16(x, y); +} + +SCALAR_FUN_ATTR int32_t smod_safe32(int32_t x, int32_t y) { + return y == 0 ? 0 : smod32(x, y); +} + +SCALAR_FUN_ATTR int64_t smod_safe64(int64_t x, int64_t y) { + return y == 0 ? 0 : smod64(x, y); +} + +SCALAR_FUN_ATTR int8_t squot8(int8_t x, int8_t y) { + return x / y; +} + +SCALAR_FUN_ATTR int16_t squot16(int16_t x, int16_t y) { + return x / y; +} + +SCALAR_FUN_ATTR int32_t squot32(int32_t x, int32_t y) { + return x / y; +} + +SCALAR_FUN_ATTR int64_t squot64(int64_t x, int64_t y) { + return x / y; +} + +SCALAR_FUN_ATTR int8_t srem8(int8_t x, int8_t y) { + return x % y; +} + +SCALAR_FUN_ATTR int16_t srem16(int16_t x, int16_t y) { + return x % y; +} + +SCALAR_FUN_ATTR int32_t srem32(int32_t x, int32_t y) { + return x % y; +} + +SCALAR_FUN_ATTR int64_t srem64(int64_t x, int64_t y) { + return x % y; +} + +SCALAR_FUN_ATTR int8_t squot_safe8(int8_t x, int8_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR int16_t squot_safe16(int16_t x, int16_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR int32_t squot_safe32(int32_t x, int32_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR int64_t squot_safe64(int64_t x, int64_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR int8_t srem_safe8(int8_t x, int8_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR int16_t srem_safe16(int16_t x, int16_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR int32_t srem_safe32(int32_t x, int32_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR int64_t srem_safe64(int64_t x, int64_t y) { + return y == 0 ? 0 : x % y; +} + +#endif + +SCALAR_FUN_ATTR int8_t smin8(int8_t x, int8_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR int16_t smin16(int16_t x, int16_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR int32_t smin32(int32_t x, int32_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR int64_t smin64(int64_t x, int64_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR uint8_t umin8(uint8_t x, uint8_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR uint16_t umin16(uint16_t x, uint16_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR uint32_t umin32(uint32_t x, uint32_t y) { + return x < y ? 
x : y; +} + +SCALAR_FUN_ATTR uint64_t umin64(uint64_t x, uint64_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR int8_t smax8(int8_t x, int8_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR int16_t smax16(int16_t x, int16_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR int32_t smax32(int32_t x, int32_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR int64_t smax64(int64_t x, int64_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint8_t umax8(uint8_t x, uint8_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint16_t umax16(uint16_t x, uint16_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint32_t umax32(uint32_t x, uint32_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint64_t umax64(uint64_t x, uint64_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint8_t shl8(uint8_t x, uint8_t y) { + return (uint8_t)(x << y); +} + +SCALAR_FUN_ATTR uint16_t shl16(uint16_t x, uint16_t y) { + return (uint16_t)(x << y); +} + +SCALAR_FUN_ATTR uint32_t shl32(uint32_t x, uint32_t y) { + return x << y; +} + +SCALAR_FUN_ATTR uint64_t shl64(uint64_t x, uint64_t y) { + return x << y; +} + +SCALAR_FUN_ATTR uint8_t lshr8(uint8_t x, uint8_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR uint16_t lshr16(uint16_t x, uint16_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR uint32_t lshr32(uint32_t x, uint32_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR uint64_t lshr64(uint64_t x, uint64_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR int8_t ashr8(int8_t x, int8_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR int16_t ashr16(int16_t x, int16_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR int32_t ashr32(int32_t x, int32_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR int64_t ashr64(int64_t x, int64_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR uint8_t and8(uint8_t x, uint8_t y) { + return x & y; +} + +SCALAR_FUN_ATTR uint16_t and16(uint16_t x, uint16_t y) { + return x & y; +} + +SCALAR_FUN_ATTR uint32_t and32(uint32_t x, uint32_t y) { + return x & y; +} + +SCALAR_FUN_ATTR uint64_t and64(uint64_t x, uint64_t y) { + return x & y; +} + +SCALAR_FUN_ATTR uint8_t or8(uint8_t x, uint8_t y) { + return x | y; +} + +SCALAR_FUN_ATTR uint16_t or16(uint16_t x, uint16_t y) { + return x | y; +} + +SCALAR_FUN_ATTR uint32_t or32(uint32_t x, uint32_t y) { + return x | y; +} + +SCALAR_FUN_ATTR uint64_t or64(uint64_t x, uint64_t y) { + return x | y; +} + +SCALAR_FUN_ATTR uint8_t xor8(uint8_t x, uint8_t y) { + return x ^ y; +} + +SCALAR_FUN_ATTR uint16_t xor16(uint16_t x, uint16_t y) { + return x ^ y; +} + +SCALAR_FUN_ATTR uint32_t xor32(uint32_t x, uint32_t y) { + return x ^ y; +} + +SCALAR_FUN_ATTR uint64_t xor64(uint64_t x, uint64_t y) { + return x ^ y; +} + +SCALAR_FUN_ATTR bool ult8(uint8_t x, uint8_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool ult16(uint16_t x, uint16_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool ult32(uint32_t x, uint32_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool ult64(uint64_t x, uint64_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool ule8(uint8_t x, uint8_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool ule16(uint16_t x, uint16_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool ule32(uint32_t x, uint32_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool ule64(uint64_t x, uint64_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool slt8(int8_t x, int8_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool slt16(int16_t x, int16_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool slt32(int32_t x, int32_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool slt64(int64_t x, int64_t y) { + 
return x < y; +} + +SCALAR_FUN_ATTR bool sle8(int8_t x, int8_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool sle16(int16_t x, int16_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool sle32(int32_t x, int32_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool sle64(int64_t x, int64_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR uint8_t pow8(uint8_t x, uint8_t y) { + uint8_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} + +SCALAR_FUN_ATTR uint16_t pow16(uint16_t x, uint16_t y) { + uint16_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} + +SCALAR_FUN_ATTR uint32_t pow32(uint32_t x, uint32_t y) { + uint32_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} + +SCALAR_FUN_ATTR uint64_t pow64(uint64_t x, uint64_t y) { + uint64_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} + +SCALAR_FUN_ATTR bool itob_i8_bool(int8_t x) { + return x != 0; +} + +SCALAR_FUN_ATTR bool itob_i16_bool(int16_t x) { + return x != 0; +} + +SCALAR_FUN_ATTR bool itob_i32_bool(int32_t x) { + return x != 0; +} + +SCALAR_FUN_ATTR bool itob_i64_bool(int64_t x) { + return x != 0; +} + +SCALAR_FUN_ATTR int8_t btoi_bool_i8(bool x) { + return x; +} + +SCALAR_FUN_ATTR int16_t btoi_bool_i16(bool x) { + return x; +} + +SCALAR_FUN_ATTR int32_t btoi_bool_i32(bool x) { + return x; +} + +SCALAR_FUN_ATTR int64_t btoi_bool_i64(bool x) { + return x; +} + +#define sext_i8_i8(x) ((int8_t) (int8_t) (x)) +#define sext_i8_i16(x) ((int16_t) (int8_t) (x)) +#define sext_i8_i32(x) ((int32_t) (int8_t) (x)) +#define sext_i8_i64(x) ((int64_t) (int8_t) (x)) +#define sext_i16_i8(x) ((int8_t) (int16_t) (x)) +#define sext_i16_i16(x) ((int16_t) (int16_t) (x)) +#define sext_i16_i32(x) ((int32_t) (int16_t) (x)) +#define sext_i16_i64(x) ((int64_t) (int16_t) (x)) +#define sext_i32_i8(x) ((int8_t) (int32_t) (x)) +#define sext_i32_i16(x) ((int16_t) (int32_t) (x)) +#define sext_i32_i32(x) ((int32_t) (int32_t) (x)) +#define sext_i32_i64(x) ((int64_t) (int32_t) (x)) +#define sext_i64_i8(x) ((int8_t) (int64_t) (x)) +#define sext_i64_i16(x) ((int16_t) (int64_t) (x)) +#define sext_i64_i32(x) ((int32_t) (int64_t) (x)) +#define sext_i64_i64(x) ((int64_t) (int64_t) (x)) +#define zext_i8_i8(x) ((int8_t) (uint8_t) (x)) +#define zext_i8_i16(x) ((int16_t) (uint8_t) (x)) +#define zext_i8_i32(x) ((int32_t) (uint8_t) (x)) +#define zext_i8_i64(x) ((int64_t) (uint8_t) (x)) +#define zext_i16_i8(x) ((int8_t) (uint16_t) (x)) +#define zext_i16_i16(x) ((int16_t) (uint16_t) (x)) +#define zext_i16_i32(x) ((int32_t) (uint16_t) (x)) +#define zext_i16_i64(x) ((int64_t) (uint16_t) (x)) +#define zext_i32_i8(x) ((int8_t) (uint32_t) (x)) +#define zext_i32_i16(x) ((int16_t) (uint32_t) (x)) +#define zext_i32_i32(x) ((int32_t) (uint32_t) (x)) +#define zext_i32_i64(x) ((int64_t) (uint32_t) (x)) +#define zext_i64_i8(x) ((int8_t) (uint64_t) (x)) +#define zext_i64_i16(x) ((int16_t) (uint64_t) (x)) +#define zext_i64_i32(x) ((int32_t) (uint64_t) (x)) +#define zext_i64_i64(x) ((int64_t) (uint64_t) (x)) + +SCALAR_FUN_ATTR int8_t abs8(int8_t x) { + return (int8_t)abs(x); +} + +SCALAR_FUN_ATTR int16_t abs16(int16_t x) { + return (int16_t)abs(x); +} + +SCALAR_FUN_ATTR int32_t abs32(int32_t x) { + return abs(x); +} + +SCALAR_FUN_ATTR int64_t abs64(int64_t x) { +#if defined(__OPENCL_VERSION__) || defined(ISPC) + return abs(x); +#else + return llabs(x); 
+#endif +} + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR int32_t futrts_popc8(int8_t x) { + return popcount(x); +} + +SCALAR_FUN_ATTR int32_t futrts_popc16(int16_t x) { + return popcount(x); +} + +SCALAR_FUN_ATTR int32_t futrts_popc32(int32_t x) { + return popcount(x); +} + +SCALAR_FUN_ATTR int32_t futrts_popc64(int64_t x) { + return popcount(x); +} +#elif defined(__CUDA_ARCH__) + +SCALAR_FUN_ATTR int32_t futrts_popc8(int8_t x) { + return __popc(zext_i8_i32(x)); +} + +SCALAR_FUN_ATTR int32_t futrts_popc16(int16_t x) { + return __popc(zext_i16_i32(x)); +} + +SCALAR_FUN_ATTR int32_t futrts_popc32(int32_t x) { + return __popc(x); +} + +SCALAR_FUN_ATTR int32_t futrts_popc64(int64_t x) { + return __popcll(x); +} + +#else // Not OpenCL or CUDA, but plain C. + +SCALAR_FUN_ATTR int32_t futrts_popc8(uint8_t x) { + int c = 0; + for (; x; ++c) { x &= x - 1; } + return c; +} + +SCALAR_FUN_ATTR int32_t futrts_popc16(uint16_t x) { + int c = 0; + for (; x; ++c) { x &= x - 1; } + return c; +} + +SCALAR_FUN_ATTR int32_t futrts_popc32(uint32_t x) { + int c = 0; + for (; x; ++c) { x &= x - 1; } + return c; +} + +SCALAR_FUN_ATTR int32_t futrts_popc64(uint64_t x) { + int c = 0; + for (; x; ++c) { x &= x - 1; } + return c; +} +#endif + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8 ( uint8_t a, uint8_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); } +#elif defined(__CUDA_ARCH__) +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); } +SCALAR_FUN_ATTR uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); } +#elif ISPC +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { + uint64_t ah = a >> 32; + uint64_t al = a & 0xffffffff; + uint64_t bh = b >> 32; + uint64_t bl = b & 0xffffffff; + + uint64_t p1 = al * bl; + uint64_t p2 = al * bh; + uint64_t p3 = ah * bl; + uint64_t p4 = ah * bh; + + uint64_t p1h = p1 >> 32; + uint64_t p2h = p2 >> 32; + uint64_t p3h = 
p3 >> 32; + uint64_t p2l = p2 & 0xffffffff; + uint64_t p3l = p3 & 0xffffffff; + + uint64_t l = p1h + p2l + p3l; + uint64_t m = (p2 >> 32) + (p3 >> 32); + uint64_t h = (l >> 32) + m + p4; + + return h; +} +SCALAR_FUN_ATTR int8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } +SCALAR_FUN_ATTR int64_t futrts_smul_hi64(int64_t a, int64_t b) { + uint64_t ah = a >> 32; + uint64_t al = a & 0xffffffff; + uint64_t bh = b >> 32; + uint64_t bl = b & 0xffffffff; + + uint64_t p1 = al * bl; + int64_t p2 = al * bh; + int64_t p3 = ah * bl; + uint64_t p4 = ah * bh; + + uint64_t p1h = p1 >> 32; + uint64_t p2h = p2 >> 32; + uint64_t p3h = p3 >> 32; + uint64_t p2l = p2 & 0xffffffff; + uint64_t p3l = p3 & 0xffffffff; + + uint64_t l = p1h + p2l + p3l; + uint64_t m = (p2 >> 32) + (p3 >> 32); + uint64_t h = (l >> 32) + m + p4; + + return h; +} + +#else // Not OpenCL, ISPC, or CUDA, but plain C. +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; } +SCALAR_FUN_ATTR int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; } +SCALAR_FUN_ATTR int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } +SCALAR_FUN_ATTR int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; } +SCALAR_FUN_ATTR int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; } +#endif + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR uint8_t futrts_umad_hi8 ( uint8_t a, uint8_t b, uint8_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint8_t futrts_smad_hi8( int8_t a, int8_t b, int8_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); } +#else // Not OpenCL + +SCALAR_FUN_ATTR uint8_t futrts_umad_hi8( uint8_t a, uint8_t b, uint8_t c) { return futrts_umul_hi8(a, b) + c; } +SCALAR_FUN_ATTR uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; } +SCALAR_FUN_ATTR uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; } +SCALAR_FUN_ATTR uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; } +SCALAR_FUN_ATTR uint8_t futrts_smad_hi8 ( int8_t a, int8_t b, int8_t c) { return futrts_smul_hi8(a, b) + c; } +SCALAR_FUN_ATTR uint16_t 
futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return futrts_smul_hi16(a, b) + c; } +SCALAR_FUN_ATTR uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; } +SCALAR_FUN_ATTR uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; } +#endif + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { + return clz(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { + return clz(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { + return clz(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { + return clz(x); +} + +#elif defined(__CUDA_ARCH__) + +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { + return __clz(zext_i8_i32(x)) - 24; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { + return __clz(zext_i16_i32(x)) - 16; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { + return __clz(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { + return __clzll(x); +} + +#elif ISPC + +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { + return count_leading_zeros((int32_t)(uint8_t)x)-24; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { + return count_leading_zeros((int32_t)(uint16_t)x)-16; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { + return count_leading_zeros(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { + return count_leading_zeros(x); +} + +#else // Not OpenCL, ISPC or CUDA, but plain C. + +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { + return x == 0 ? 8 : __builtin_clz((uint32_t)zext_i8_i32(x)) - 24; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { + return x == 0 ? 16 : __builtin_clz((uint32_t)zext_i16_i32(x)) - 16; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { + return x == 0 ? 32 : __builtin_clz((uint32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { + return x == 0 ? 64 : __builtin_clzll((uint64_t)x); +} +#endif + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { + int i = 0; + for (; i < 8 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { + int i = 0; + for (; i < 16 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { + int i = 0; + for (; i < 32 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { + int i = 0; + for (; i < 64 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} + +#elif defined(__CUDA_ARCH__) + +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { + int y = __ffs(x); + return y == 0 ? 8 : y - 1; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { + int y = __ffs(x); + return y == 0 ? 16 : y - 1; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { + int y = __ffs(x); + return y == 0 ? 32 : y - 1; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { + int y = __ffsll(x); + return y == 0 ? 64 : y - 1; +} + +#elif ISPC + +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { + return x == 0 ? 8 : count_trailing_zeros((int32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { + return x == 0 ? 16 : count_trailing_zeros((int32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { + return count_trailing_zeros(x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { + return count_trailing_zeros(x); +} + +#else // Not OpenCL or CUDA, but plain C. + +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { + return x == 0 ? 
8 : __builtin_ctz((uint32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { + return x == 0 ? 16 : __builtin_ctz((uint32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { + return x == 0 ? 32 : __builtin_ctz((uint32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { + return x == 0 ? 64 : __builtin_ctzll((uint64_t)x); +} +#endif + +SCALAR_FUN_ATTR float fdiv32(float x, float y) { + return x / y; +} + +SCALAR_FUN_ATTR float fadd32(float x, float y) { + return x + y; +} + +SCALAR_FUN_ATTR float fsub32(float x, float y) { + return x - y; +} + +SCALAR_FUN_ATTR float fmul32(float x, float y) { + return x * y; +} + +SCALAR_FUN_ATTR bool cmplt32(float x, float y) { + return x < y; +} + +SCALAR_FUN_ATTR bool cmple32(float x, float y) { + return x <= y; +} + +SCALAR_FUN_ATTR float sitofp_i8_f32(int8_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float sitofp_i16_f32(int16_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float sitofp_i32_f32(int32_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float sitofp_i64_f32(int64_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float uitofp_i8_f32(uint8_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float uitofp_i16_f32(uint16_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float uitofp_i32_f32(uint32_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float uitofp_i64_f32(uint64_t x) { + return (float) x; +} + +#ifdef __OPENCL_VERSION__ +SCALAR_FUN_ATTR float fabs32(float x) { + return fabs(x); +} + +SCALAR_FUN_ATTR float fmax32(float x, float y) { + return fmax(x, y); +} + +SCALAR_FUN_ATTR float fmin32(float x, float y) { + return fmin(x, y); +} + +SCALAR_FUN_ATTR float fpow32(float x, float y) { + return pow(x, y); +} + +#elif ISPC + +SCALAR_FUN_ATTR float fabs32(float x) { + return abs(x); +} + +SCALAR_FUN_ATTR float fmax32(float x, float y) { + return isnan(x) ? y : isnan(y) ? x : max(x, y); +} + +SCALAR_FUN_ATTR float fmin32(float x, float y) { + return isnan(x) ? y : isnan(y) ? x : min(x, y); +} + +SCALAR_FUN_ATTR float fpow32(float a, float b) { + float ret; + foreach_active (i) { + uniform float r = __stdlib_powf(extract(a, i), extract(b, i)); + ret = insert(ret, i, r); + } + return ret; +} + +#else // Not OpenCL, but CUDA or plain C. 
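+
+// Note: like the ISPC branch above, the C99 fmaxf/fminf used in this
+// branch treat a NaN operand as missing data rather than propagating
+// it: e.g. fmax32(NAN, 2.0f) == 2.0f and fmin32(2.0f, NAN) == 2.0f,
+// while fmax32(NAN, NAN) is still NaN.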
+ +SCALAR_FUN_ATTR float fabs32(float x) { + return fabsf(x); +} + +SCALAR_FUN_ATTR float fmax32(float x, float y) { + return fmaxf(x, y); +} + +SCALAR_FUN_ATTR float fmin32(float x, float y) { + return fminf(x, y); +} + +SCALAR_FUN_ATTR float fpow32(float x, float y) { + return powf(x, y); +} +#endif + +SCALAR_FUN_ATTR bool futrts_isnan32(float x) { + return isnan(x); +} + +#if ISPC + +SCALAR_FUN_ATTR bool futrts_isinf32(float x) { + return !isnan(x) && isnan(x - x); +} + +SCALAR_FUN_ATTR bool futrts_isfinite32(float x) { + return !isnan(x) && !futrts_isinf32(x); +} + +#else + +SCALAR_FUN_ATTR bool futrts_isinf32(float x) { + return isinf(x); +} + +#endif + +SCALAR_FUN_ATTR int8_t fptosi_f32_i8(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (int8_t) x; + } +} + +SCALAR_FUN_ATTR int16_t fptosi_f32_i16(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (int16_t) x; + } +} + +SCALAR_FUN_ATTR int32_t fptosi_f32_i32(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (int32_t) x; + } +} + +SCALAR_FUN_ATTR int64_t fptosi_f32_i64(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (int64_t) x; + }; +} + +SCALAR_FUN_ATTR uint8_t fptoui_f32_i8(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (uint8_t) (int8_t) x; + } +} + +SCALAR_FUN_ATTR uint16_t fptoui_f32_i16(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (uint16_t) (int16_t) x; + } +} + +SCALAR_FUN_ATTR uint32_t fptoui_f32_i32(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (uint32_t) (int32_t) x; + } +} + +SCALAR_FUN_ATTR uint64_t fptoui_f32_i64(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (uint64_t) (int64_t) x; + } +} + +SCALAR_FUN_ATTR bool ftob_f32_bool(float x) { + return x != 0; +} + +SCALAR_FUN_ATTR float btof_bool_f32(bool x) { + return x ? 
1 : 0; +} + +#ifdef __OPENCL_VERSION__ +SCALAR_FUN_ATTR float futrts_log32(float x) { + return log(x); +} + +SCALAR_FUN_ATTR float futrts_log2_32(float x) { + return log2(x); +} + +SCALAR_FUN_ATTR float futrts_log10_32(float x) { + return log10(x); +} + +SCALAR_FUN_ATTR float futrts_log1p_32(float x) { + return log1p(x); +} + +SCALAR_FUN_ATTR float futrts_sqrt32(float x) { + return sqrt(x); +} + +SCALAR_FUN_ATTR float futrts_cbrt32(float x) { + return cbrt(x); +} + +SCALAR_FUN_ATTR float futrts_exp32(float x) { + return exp(x); +} + +SCALAR_FUN_ATTR float futrts_cos32(float x) { + return cos(x); +} + +SCALAR_FUN_ATTR float futrts_sin32(float x) { + return sin(x); +} + +SCALAR_FUN_ATTR float futrts_tan32(float x) { + return tan(x); +} + +SCALAR_FUN_ATTR float futrts_acos32(float x) { + return acos(x); +} + +SCALAR_FUN_ATTR float futrts_asin32(float x) { + return asin(x); +} + +SCALAR_FUN_ATTR float futrts_atan32(float x) { + return atan(x); +} + +SCALAR_FUN_ATTR float futrts_cosh32(float x) { + return cosh(x); +} + +SCALAR_FUN_ATTR float futrts_sinh32(float x) { + return sinh(x); +} + +SCALAR_FUN_ATTR float futrts_tanh32(float x) { + return tanh(x); +} + +SCALAR_FUN_ATTR float futrts_acosh32(float x) { + return acosh(x); +} + +SCALAR_FUN_ATTR float futrts_asinh32(float x) { + return asinh(x); +} + +SCALAR_FUN_ATTR float futrts_atanh32(float x) { + return atanh(x); +} + +SCALAR_FUN_ATTR float futrts_atan2_32(float x, float y) { + return atan2(x, y); +} + +SCALAR_FUN_ATTR float futrts_hypot32(float x, float y) { + return hypot(x, y); +} + +SCALAR_FUN_ATTR float futrts_gamma32(float x) { + return tgamma(x); +} + +SCALAR_FUN_ATTR float futrts_lgamma32(float x) { + return lgamma(x); +} + +SCALAR_FUN_ATTR float futrts_erf32(float x) { + return erf(x); +} + +SCALAR_FUN_ATTR float futrts_erfc32(float x) { + return erfc(x); +} + +SCALAR_FUN_ATTR float fmod32(float x, float y) { + return fmod(x, y); +} + +SCALAR_FUN_ATTR float futrts_round32(float x) { + return rint(x); +} + +SCALAR_FUN_ATTR float futrts_floor32(float x) { + return floor(x); +} + +SCALAR_FUN_ATTR float futrts_ceil32(float x) { + return ceil(x); +} + +SCALAR_FUN_ATTR float futrts_nextafter32(float x, float y) { + return nextafter(x, y); +} + +SCALAR_FUN_ATTR float futrts_lerp32(float v0, float v1, float t) { + return mix(v0, v1, t); +} + +SCALAR_FUN_ATTR float futrts_ldexp32(float x, int32_t y) { + return ldexp(x, y); +} + +SCALAR_FUN_ATTR float futrts_copysign32(float x, float y) { + return copysign(x, y); +} + +SCALAR_FUN_ATTR float futrts_mad32(float a, float b, float c) { + return mad(a, b, c); +} + +SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) { + return fma(a, b, c); +} + +#elif ISPC + +SCALAR_FUN_ATTR float futrts_log32(float x) { + return futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)? 
log(x) : x; +} + +SCALAR_FUN_ATTR float futrts_log2_32(float x) { + return futrts_log32(x) / log(2.0f); +} + +SCALAR_FUN_ATTR float futrts_log10_32(float x) { + return futrts_log32(x) / log(10.0f); +} + +SCALAR_FUN_ATTR float futrts_log1p_32(float x) { + if(x == -1.0f || (futrts_isinf32(x) && x > 0.0f)) return x / 0.0f; + float y = 1.0f + x; + float z = y - 1.0f; + return log(y) - (z-x)/y; +} + +SCALAR_FUN_ATTR float futrts_sqrt32(float x) { + return sqrt(x); +} + +extern "C" unmasked uniform float cbrtf(uniform float); +SCALAR_FUN_ATTR float futrts_cbrt32(float x) { + float res; + foreach_active (i) { + uniform float r = cbrtf(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +SCALAR_FUN_ATTR float futrts_exp32(float x) { + return exp(x); +} + +SCALAR_FUN_ATTR float futrts_cos32(float x) { + return cos(x); +} + +SCALAR_FUN_ATTR float futrts_sin32(float x) { + return sin(x); +} + +SCALAR_FUN_ATTR float futrts_tan32(float x) { + return tan(x); +} + +SCALAR_FUN_ATTR float futrts_acos32(float x) { + return acos(x); +} + +SCALAR_FUN_ATTR float futrts_asin32(float x) { + return asin(x); +} + +SCALAR_FUN_ATTR float futrts_atan32(float x) { + return atan(x); +} + +SCALAR_FUN_ATTR float futrts_cosh32(float x) { + return (exp(x)+exp(-x)) / 2.0f; +} + +SCALAR_FUN_ATTR float futrts_sinh32(float x) { + return (exp(x)-exp(-x)) / 2.0f; +} + +SCALAR_FUN_ATTR float futrts_tanh32(float x) { + return futrts_sinh32(x)/futrts_cosh32(x); +} + +SCALAR_FUN_ATTR float futrts_acosh32(float x) { + float f = x+sqrt(x*x-1); + if(futrts_isfinite32(f)) return log(f); + return f; +} + +SCALAR_FUN_ATTR float futrts_asinh32(float x) { + float f = x+sqrt(x*x+1); + if(futrts_isfinite32(f)) return log(f); + return f; + +} + +SCALAR_FUN_ATTR float futrts_atanh32(float x) { + float f = (1+x)/(1-x); + if(futrts_isfinite32(f)) return log(f)/2.0f; + return f; + +} + +SCALAR_FUN_ATTR float futrts_atan2_32(float x, float y) { + return (x == 0.0f && y == 0.0f) ? 
0.0f : atan2(x, y); +} + +SCALAR_FUN_ATTR float futrts_hypot32(float x, float y) { + if (futrts_isfinite32(x) && futrts_isfinite32(y)) { + x = abs(x); + y = abs(y); + float a; + float b; + if (x >= y){ + a = x; + b = y; + } else { + a = y; + b = x; + } + if(b == 0){ + return a; + } + + int e; + float an; + float bn; + an = frexp (a, &e); + bn = ldexp (b, - e); + float cn; + cn = sqrt (an * an + bn * bn); + return ldexp (cn, e); + } else { + if (futrts_isinf32(x) || futrts_isinf32(y)) return INFINITY; + else return x + y; + } + +} + +extern "C" unmasked uniform float tgammaf(uniform float x); +SCALAR_FUN_ATTR float futrts_gamma32(float x) { + float res; + foreach_active (i) { + uniform float r = tgammaf(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +extern "C" unmasked uniform float lgammaf(uniform float x); +SCALAR_FUN_ATTR float futrts_lgamma32(float x) { + float res; + foreach_active (i) { + uniform float r = lgammaf(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +extern "C" unmasked uniform float erff(uniform float x); +SCALAR_FUN_ATTR float futrts_erf32(float x) { + float res; + foreach_active (i) { + uniform float r = erff(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +extern "C" unmasked uniform float erfcf(uniform float x); +SCALAR_FUN_ATTR float futrts_erfc32(float x) { + float res; + foreach_active (i) { + uniform float r = erfcf(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +SCALAR_FUN_ATTR float fmod32(float x, float y) { + return x - y * trunc(x/y); +} + +SCALAR_FUN_ATTR float futrts_round32(float x) { + return round(x); +} + +SCALAR_FUN_ATTR float futrts_floor32(float x) { + return floor(x); +} + +SCALAR_FUN_ATTR float futrts_ceil32(float x) { + return ceil(x); +} + +extern "C" unmasked uniform float nextafterf(uniform float x, uniform float y); +SCALAR_FUN_ATTR float futrts_nextafter32(float x, float y) { + float res; + foreach_active (i) { + uniform float r = nextafterf(extract(x, i), extract(y, i)); + res = insert(res, i, r); + } + return res; +} + +SCALAR_FUN_ATTR float futrts_lerp32(float v0, float v1, float t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR float futrts_ldexp32(float x, int32_t y) { + return x * pow((double)2.0, (double)y); +} + +SCALAR_FUN_ATTR float futrts_copysign32(float x, float y) { + int32_t xb = futrts_to_bits32(x); + int32_t yb = futrts_to_bits32(y); + return futrts_from_bits32((xb & ~(1<<31)) | (yb & (1<<31))); +} + +SCALAR_FUN_ATTR float futrts_mad32(float a, float b, float c) { + return a * b + c; +} + +SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) { + return a * b + c; +} + +#else // Not OpenCL or ISPC, but CUDA or plain C. 
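+
+// Note: this branch defers directly to libm. The ISPC hypot emulation
+// above instead scales by frexp/ldexp so that an*an + bn*bn cannot
+// overflow: e.g. hypot32(3e20f, 4e20f) is 5e20f, where a naive
+// sqrtf(x*x + y*y) would already have overflowed to infinity.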
+ +SCALAR_FUN_ATTR float futrts_log32(float x) { + return logf(x); +} + +SCALAR_FUN_ATTR float futrts_log2_32(float x) { + return log2f(x); +} + +SCALAR_FUN_ATTR float futrts_log10_32(float x) { + return log10f(x); +} + +SCALAR_FUN_ATTR float futrts_log1p_32(float x) { + return log1pf(x); +} + +SCALAR_FUN_ATTR float futrts_sqrt32(float x) { + return sqrtf(x); +} + +SCALAR_FUN_ATTR float futrts_cbrt32(float x) { + return cbrtf(x); +} + +SCALAR_FUN_ATTR float futrts_exp32(float x) { + return expf(x); +} + +SCALAR_FUN_ATTR float futrts_cos32(float x) { + return cosf(x); +} + +SCALAR_FUN_ATTR float futrts_sin32(float x) { + return sinf(x); +} + +SCALAR_FUN_ATTR float futrts_tan32(float x) { + return tanf(x); +} + +SCALAR_FUN_ATTR float futrts_acos32(float x) { + return acosf(x); +} + +SCALAR_FUN_ATTR float futrts_asin32(float x) { + return asinf(x); +} + +SCALAR_FUN_ATTR float futrts_atan32(float x) { + return atanf(x); +} + +SCALAR_FUN_ATTR float futrts_cosh32(float x) { + return coshf(x); +} + +SCALAR_FUN_ATTR float futrts_sinh32(float x) { + return sinhf(x); +} + +SCALAR_FUN_ATTR float futrts_tanh32(float x) { + return tanhf(x); +} + +SCALAR_FUN_ATTR float futrts_acosh32(float x) { + return acoshf(x); +} + +SCALAR_FUN_ATTR float futrts_asinh32(float x) { + return asinhf(x); +} + +SCALAR_FUN_ATTR float futrts_atanh32(float x) { + return atanhf(x); +} + +SCALAR_FUN_ATTR float futrts_atan2_32(float x, float y) { + return atan2f(x, y); +} + +SCALAR_FUN_ATTR float futrts_hypot32(float x, float y) { + return hypotf(x, y); +} + +SCALAR_FUN_ATTR float futrts_gamma32(float x) { + return tgammaf(x); +} + +SCALAR_FUN_ATTR float futrts_lgamma32(float x) { + return lgammaf(x); +} + +SCALAR_FUN_ATTR float futrts_erf32(float x) { + return erff(x); +} + +SCALAR_FUN_ATTR float futrts_erfc32(float x) { + return erfcf(x); +} + +SCALAR_FUN_ATTR float fmod32(float x, float y) { + return fmodf(x, y); +} + +SCALAR_FUN_ATTR float futrts_round32(float x) { + return rintf(x); +} + +SCALAR_FUN_ATTR float futrts_floor32(float x) { + return floorf(x); +} + +SCALAR_FUN_ATTR float futrts_ceil32(float x) { + return ceilf(x); +} + +SCALAR_FUN_ATTR float futrts_nextafter32(float x, float y) { + return nextafterf(x, y); +} + +SCALAR_FUN_ATTR float futrts_lerp32(float v0, float v1, float t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR float futrts_ldexp32(float x, int32_t y) { + return ldexpf(x, y); +} + +SCALAR_FUN_ATTR float futrts_copysign32(float x, float y) { + return copysignf(x, y); +} + +SCALAR_FUN_ATTR float futrts_mad32(float a, float b, float c) { + return a * b + c; +} + +SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) { + return fmaf(a, b, c); +} +#endif + +#if ISPC +SCALAR_FUN_ATTR int32_t futrts_to_bits32(float x) { + return intbits(x); +} + +SCALAR_FUN_ATTR float futrts_from_bits32(int32_t x) { + return floatbits(x); +} +#else +SCALAR_FUN_ATTR int32_t futrts_to_bits32(float x) { + union { + float f; + int32_t t; + } p; + + p.f = x; + return p.t; +} + +SCALAR_FUN_ATTR float futrts_from_bits32(int32_t x) { + union { + int32_t f; + float t; + } p; + + p.f = x; + return p.t; +} +#endif + +SCALAR_FUN_ATTR float fsignum32(float x) { + return futrts_isnan32(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 
1 : 0);
+}
+
+#ifdef FUTHARK_F64_ENABLED
+
+SCALAR_FUN_ATTR double futrts_from_bits64(int64_t x);
+SCALAR_FUN_ATTR int64_t futrts_to_bits64(double x);
+
+#if ISPC
+SCALAR_FUN_ATTR bool futrts_isinf64(double x) {
+  return !isnan(x) && isnan(x - x);
+}
+
+SCALAR_FUN_ATTR bool futrts_isfinite64(double x) {
+  return !isnan(x) && !futrts_isinf64(x);
+}
+
+SCALAR_FUN_ATTR double fdiv64(double x, double y) {
+  return x / y;
+}
+
+SCALAR_FUN_ATTR double fadd64(double x, double y) {
+  return x + y;
+}
+
+SCALAR_FUN_ATTR double fsub64(double x, double y) {
+  return x - y;
+}
+
+SCALAR_FUN_ATTR double fmul64(double x, double y) {
+  return x * y;
+}
+
+SCALAR_FUN_ATTR bool cmplt64(double x, double y) {
+  return x < y;
+}
+
+SCALAR_FUN_ATTR bool cmple64(double x, double y) {
+  return x <= y;
+}
+
+SCALAR_FUN_ATTR double sitofp_i8_f64(int8_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double sitofp_i16_f64(int16_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double sitofp_i32_f64(int32_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double sitofp_i64_f64(int64_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double uitofp_i8_f64(uint8_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double uitofp_i16_f64(uint16_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double uitofp_i32_f64(uint32_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double uitofp_i64_f64(uint64_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double fabs64(double x) {
+  return abs(x);
+}
+
+SCALAR_FUN_ATTR double fmax64(double x, double y) {
+  return isnan(x) ? y : isnan(y) ? x : max(x, y);
+}
+
+SCALAR_FUN_ATTR double fmin64(double x, double y) {
+  return isnan(x) ? y : isnan(y) ? x : min(x, y);
+}
+
+SCALAR_FUN_ATTR double fpow64(double a, double b) {
+  float ret;
+  foreach_active (i) {
+    uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
+    ret = insert(ret, i, r);
+  }
+  return ret;
+}
+
+SCALAR_FUN_ATTR double futrts_log64(double x) {
+  return futrts_isfinite64(x) || (futrts_isinf64(x) && x < 0)? 
log(x) : x;
+}
+
+SCALAR_FUN_ATTR double futrts_log2_64(double x) {
+  return futrts_log64(x)/log(2.0d);
+}
+
+SCALAR_FUN_ATTR double futrts_log10_64(double x) {
+  return futrts_log64(x)/log(10.0d);
+}
+
+SCALAR_FUN_ATTR double futrts_log1p_64(double x) {
+  if(x == -1.0d || (futrts_isinf64(x) && x > 0.0d)) return x / 0.0d;
+  double y = 1.0d + x;
+  double z = y - 1.0d;
+  return log(y) - (z-x)/y;
+}
+
+SCALAR_FUN_ATTR double futrts_sqrt64(double x) {
+  return sqrt(x);
+}
+
+extern "C" unmasked uniform double cbrt(uniform double);
+SCALAR_FUN_ATTR double futrts_cbrt64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = cbrt(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double futrts_exp64(double x) {
+  return exp(x);
+}
+
+SCALAR_FUN_ATTR double futrts_cos64(double x) {
+  return cos(x);
+}
+
+SCALAR_FUN_ATTR double futrts_sin64(double x) {
+  return sin(x);
+}
+
+SCALAR_FUN_ATTR double futrts_tan64(double x) {
+  return tan(x);
+}
+
+SCALAR_FUN_ATTR double futrts_acos64(double x) {
+  return acos(x);
+}
+
+SCALAR_FUN_ATTR double futrts_asin64(double x) {
+  return asin(x);
+}
+
+SCALAR_FUN_ATTR double futrts_atan64(double x) {
+  return atan(x);
+}
+
+SCALAR_FUN_ATTR double futrts_cosh64(double x) {
+  return (exp(x)+exp(-x)) / 2.0d;
+}
+
+SCALAR_FUN_ATTR double futrts_sinh64(double x) {
+  return (exp(x)-exp(-x)) / 2.0d;
+}
+
+SCALAR_FUN_ATTR double futrts_tanh64(double x) {
+  return futrts_sinh64(x)/futrts_cosh64(x);
+}
+
+SCALAR_FUN_ATTR double futrts_acosh64(double x) {
+  double f = x+sqrt(x*x-1.0d);
+  if(futrts_isfinite64(f)) return log(f);
+  return f;
+}
+
+SCALAR_FUN_ATTR double futrts_asinh64(double x) {
+  double f = x+sqrt(x*x+1.0d);
+  if(futrts_isfinite64(f)) return log(f);
+  return f;
+}
+
+SCALAR_FUN_ATTR double futrts_atanh64(double x) {
+  double f = (1.0d+x)/(1.0d-x);
+  if(futrts_isfinite64(f)) return log(f)/2.0d;
+  return f;
+
+}
+
+SCALAR_FUN_ATTR double futrts_atan2_64(double x, double y) {
+  return atan2(x, y);
+}
+
+extern "C" unmasked uniform double hypot(uniform double x, uniform double y);
+SCALAR_FUN_ATTR double futrts_hypot64(double x, double y) {
+  double res;
+  foreach_active (i) {
+    uniform double r = hypot(extract(x, i), extract(y, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+extern "C" unmasked uniform double tgamma(uniform double x);
+SCALAR_FUN_ATTR double futrts_gamma64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = tgamma(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+extern "C" unmasked uniform double lgamma(uniform double x);
+SCALAR_FUN_ATTR double futrts_lgamma64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = lgamma(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+extern "C" unmasked uniform double erf(uniform double x);
+SCALAR_FUN_ATTR double futrts_erf64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = erf(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+extern "C" unmasked uniform double erfc(uniform double x);
+SCALAR_FUN_ATTR double futrts_erfc64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = erfc(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double futrts_fma64(double a, double b, double c) {
+  return a * b + c;
+}
+
+SCALAR_FUN_ATTR double futrts_round64(double x) {
+  return round(x);
+}
+
+SCALAR_FUN_ATTR double futrts_ceil64(double x) {
+  return ceil(x);
+}
+
+extern "C" unmasked uniform double 
nextafter(uniform double x, uniform double y);
+SCALAR_FUN_ATTR double futrts_nextafter64(double x, double y) {
+  double res;
+  foreach_active (i) {
+    uniform double r = nextafter(extract(x, i), extract(y, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double futrts_floor64(double x) {
+  return floor(x);
+}
+
+SCALAR_FUN_ATTR bool futrts_isnan64(double x) {
+  return isnan(x);
+}
+
+SCALAR_FUN_ATTR int8_t fptosi_f64_i8(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int8_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int16_t fptosi_f64_i16(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int16_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int32_t fptosi_f64_i32(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int32_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int64_t fptosi_f64_i64(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int64_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint8_t fptoui_f64_i8(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint8_t) (int8_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint16_t fptoui_f64_i16(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint16_t) (int16_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint32_t fptoui_f64_i32(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint32_t) (int32_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint64_t fptoui_f64_i64(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint64_t) (int64_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR bool ftob_f64_bool(double x) {
+  return x != 0.0;
+}
+
+SCALAR_FUN_ATTR double btof_bool_f64(bool x) {
+  return x ? 1.0 : 0.0;
+}
+
+SCALAR_FUN_ATTR int64_t futrts_to_bits64(double x) {
+  int64_t res;
+  foreach_active (i) {
+    uniform double tmp = extract(x, i);
+    uniform int64_t r = *((uniform int64_t* uniform)&tmp);
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double futrts_from_bits64(int64_t x) {
+  double res;
+  foreach_active (i) {
+    uniform int64_t tmp = extract(x, i);
+    uniform double r = *((uniform double* uniform)&tmp);
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double fmod64(double x, double y) {
+  return x - y * trunc(x/y);
+}
+
+SCALAR_FUN_ATTR double fsignum64(double x) {
+  return futrts_isnan64(x) ? x : (x > 0 ? 1.0d : 0.0d) - (x < 0 ? 
1.0d : 0.0d); +} + +SCALAR_FUN_ATTR double futrts_lerp64(double v0, double v1, double t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR double futrts_ldexp64(double x, int32_t y) { + return x * pow((double)2.0, (double)y); +} + +SCALAR_FUN_ATTR double futrts_copysign64(double x, double y) { + int64_t xb = futrts_to_bits64(x); + int64_t yb = futrts_to_bits64(y); + return futrts_from_bits64((xb & ~(((int64_t)1)<<63)) | (yb & (((int64_t)1)<<63))); +} + +SCALAR_FUN_ATTR double futrts_mad64(double a, double b, double c) { + return a * b + c; +} + +SCALAR_FUN_ATTR float fpconv_f32_f32(float x) { + return (float) x; +} + +SCALAR_FUN_ATTR double fpconv_f32_f64(float x) { + return (double) x; +} + +SCALAR_FUN_ATTR float fpconv_f64_f32(double x) { + return (float) x; +} + +SCALAR_FUN_ATTR double fpconv_f64_f64(double x) { + return (double) x; +} + +#else + +SCALAR_FUN_ATTR double fdiv64(double x, double y) { + return x / y; +} + +SCALAR_FUN_ATTR double fadd64(double x, double y) { + return x + y; +} + +SCALAR_FUN_ATTR double fsub64(double x, double y) { + return x - y; +} + +SCALAR_FUN_ATTR double fmul64(double x, double y) { + return x * y; +} + +SCALAR_FUN_ATTR bool cmplt64(double x, double y) { + return x < y; +} + +SCALAR_FUN_ATTR bool cmple64(double x, double y) { + return x <= y; +} + +SCALAR_FUN_ATTR double sitofp_i8_f64(int8_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double sitofp_i16_f64(int16_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double sitofp_i32_f64(int32_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double sitofp_i64_f64(int64_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double uitofp_i8_f64(uint8_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double uitofp_i16_f64(uint16_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double uitofp_i32_f64(uint32_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double uitofp_i64_f64(uint64_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double fabs64(double x) { + return fabs(x); +} + +SCALAR_FUN_ATTR double fmax64(double x, double y) { + return fmax(x, y); +} + +SCALAR_FUN_ATTR double fmin64(double x, double y) { + return fmin(x, y); +} + +SCALAR_FUN_ATTR double fpow64(double x, double y) { + return pow(x, y); +} + +SCALAR_FUN_ATTR double futrts_log64(double x) { + return log(x); +} + +SCALAR_FUN_ATTR double futrts_log2_64(double x) { + return log2(x); +} + +SCALAR_FUN_ATTR double futrts_log10_64(double x) { + return log10(x); +} + +SCALAR_FUN_ATTR double futrts_log1p_64(double x) { + return log1p(x); +} + +SCALAR_FUN_ATTR double futrts_sqrt64(double x) { + return sqrt(x); +} + +SCALAR_FUN_ATTR double futrts_cbrt64(double x) { + return cbrt(x); +} + +SCALAR_FUN_ATTR double futrts_exp64(double x) { + return exp(x); +} + +SCALAR_FUN_ATTR double futrts_cos64(double x) { + return cos(x); +} + +SCALAR_FUN_ATTR double futrts_sin64(double x) { + return sin(x); +} + +SCALAR_FUN_ATTR double futrts_tan64(double x) { + return tan(x); +} + +SCALAR_FUN_ATTR double futrts_acos64(double x) { + return acos(x); +} + +SCALAR_FUN_ATTR double futrts_asin64(double x) { + return asin(x); +} + +SCALAR_FUN_ATTR double futrts_atan64(double x) { + return atan(x); +} + +SCALAR_FUN_ATTR double futrts_cosh64(double x) { + return cosh(x); +} + +SCALAR_FUN_ATTR double futrts_sinh64(double x) { + return sinh(x); +} + +SCALAR_FUN_ATTR double futrts_tanh64(double x) { + return tanh(x); +} + +SCALAR_FUN_ATTR double futrts_acosh64(double x) { + return acosh(x); +} + +SCALAR_FUN_ATTR double futrts_asinh64(double x) { + return asinh(x); +} + 
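+// Note: atanh below, like acosh/asinh above, maps directly to libm;
+// the ISPC branch instead derives it from the log identity
+// atanh(x) == log((1 + x) / (1 - x)) / 2, which agrees for finite
+// inputs.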
+SCALAR_FUN_ATTR double futrts_atanh64(double x) {
+  return atanh(x);
+}
+
+SCALAR_FUN_ATTR double futrts_atan2_64(double x, double y) {
+  return atan2(x, y);
+}
+
+SCALAR_FUN_ATTR double futrts_hypot64(double x, double y) {
+  return hypot(x, y);
+}
+
+SCALAR_FUN_ATTR double futrts_gamma64(double x) {
+  return tgamma(x);
+}
+
+SCALAR_FUN_ATTR double futrts_lgamma64(double x) {
+  return lgamma(x);
+}
+
+SCALAR_FUN_ATTR double futrts_erf64(double x) {
+  return erf(x);
+}
+
+SCALAR_FUN_ATTR double futrts_erfc64(double x) {
+  return erfc(x);
+}
+
+SCALAR_FUN_ATTR double futrts_fma64(double a, double b, double c) {
+  return fma(a, b, c);
+}
+
+SCALAR_FUN_ATTR double futrts_round64(double x) {
+  return rint(x);
+}
+
+SCALAR_FUN_ATTR double futrts_ceil64(double x) {
+  return ceil(x);
+}
+
+SCALAR_FUN_ATTR double futrts_nextafter64(double x, double y) {
+  return nextafter(x, y);
+}
+
+SCALAR_FUN_ATTR double futrts_floor64(double x) {
+  return floor(x);
+}
+
+SCALAR_FUN_ATTR bool futrts_isnan64(double x) {
+  return isnan(x);
+}
+
+SCALAR_FUN_ATTR bool futrts_isinf64(double x) {
+  return isinf(x);
+}
+
+SCALAR_FUN_ATTR int8_t fptosi_f64_i8(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int8_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int16_t fptosi_f64_i16(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int16_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int32_t fptosi_f64_i32(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int32_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int64_t fptosi_f64_i64(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int64_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint8_t fptoui_f64_i8(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint8_t) (int8_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint16_t fptoui_f64_i16(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint16_t) (int16_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint32_t fptoui_f64_i32(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint32_t) (int32_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint64_t fptoui_f64_i64(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint64_t) (int64_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR bool ftob_f64_bool(double x) {
+  return x != 0;
+}
+
+SCALAR_FUN_ATTR double btof_bool_f64(bool x) {
+  return x ? 1 : 0;
+}
+
+SCALAR_FUN_ATTR int64_t futrts_to_bits64(double x) {
+  union {
+    double f;
+    int64_t t;
+  } p;
+
+  p.f = x;
+  return p.t;
+}
+
+SCALAR_FUN_ATTR double futrts_from_bits64(int64_t x) {
+  union {
+    int64_t f;
+    double t;
+  } p;
+
+  p.f = x;
+  return p.t;
+}
+
+SCALAR_FUN_ATTR double fmod64(double x, double y) {
+  return fmod(x, y);
+}
+
+SCALAR_FUN_ATTR double fsignum64(double x) {
+  return futrts_isnan64(x) ? 
x : (x > 0) - (x < 0); +} + +SCALAR_FUN_ATTR double futrts_lerp64(double v0, double v1, double t) { +#ifdef __OPENCL_VERSION__ + return mix(v0, v1, t); +#else + return v0 + (v1 - v0) * t; +#endif +} + +SCALAR_FUN_ATTR double futrts_ldexp64(double x, int32_t y) { + return ldexp(x, y); +} + +SCALAR_FUN_ATTR float futrts_copysign64(double x, double y) { + return copysign(x, y); +} + +SCALAR_FUN_ATTR double futrts_mad64(double a, double b, double c) { +#ifdef __OPENCL_VERSION__ + return mad(a, b, c); +#else + return a * b + c; +#endif +} + +SCALAR_FUN_ATTR float fpconv_f32_f32(float x) { + return (float) x; +} + +SCALAR_FUN_ATTR double fpconv_f32_f64(float x) { + return (double) x; +} + +SCALAR_FUN_ATTR float fpconv_f64_f32(double x) { + return (float) x; +} + +SCALAR_FUN_ATTR double fpconv_f64_f64(double x) { + return (double) x; +} + +#endif + +#endif + +// End of scalar.h. +// Start of scalar_f16.h. + +// Half-precision is emulated if needed (e.g. in straight C) with the +// native type used if possible. The emulation works by typedef'ing +// 'float' to 'f16', and then implementing all operations on single +// precision. To cut down on duplication, we use the same code for +// those Futhark functions that require just operators or casts. The +// in-memory representation for arrays will still be 16 bits even +// under emulation, so the compiler will have to be careful when +// generating reads or writes. + +#if !defined(cl_khr_fp16) && !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) && !(defined(ISPC)) +#define EMULATE_F16 +#endif + +#if !defined(EMULATE_F16) && defined(__OPENCL_VERSION__) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + +#ifdef EMULATE_F16 + +// Note that the half-precision storage format is still 16 bits - the +// compiler will have to be real careful! +typedef float f16; + +#elif ISPC +typedef float16 f16; + +#else + +#ifdef __CUDA_ARCH__ +#include +#endif + +typedef half f16; + +#endif + +// Some of these functions convert to single precision because half +// precision versions are not available. 
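+
+// Illustrative sketch (editorial, not compiler output): the emulation scheme
+// described above keeps array elements 16 bits wide even when scalar
+// arithmetic runs in single precision. Assuming the float2halfbits and
+// halfbits2float helpers that the emulated paths below rely on, a
+// load-compute-store cycle looks conceptually like this:
+//
+//   f16 v = (f16)halfbits2float(mem[i]);  // widen the stored 16-bit pattern
+//   v = fadd16(v, v);                     // compute (possibly as a float)
+//   mem[i] = float2halfbits((float)v);    // narrow the result back to 16 bits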
+ +SCALAR_FUN_ATTR f16 fadd16(f16 x, f16 y) { + return x + y; +} + +SCALAR_FUN_ATTR f16 fsub16(f16 x, f16 y) { + return x - y; +} + +SCALAR_FUN_ATTR f16 fmul16(f16 x, f16 y) { + return x * y; +} + +SCALAR_FUN_ATTR bool cmplt16(f16 x, f16 y) { + return x < y; +} + +SCALAR_FUN_ATTR bool cmple16(f16 x, f16 y) { + return x <= y; +} + +SCALAR_FUN_ATTR f16 sitofp_i8_f16(int8_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 sitofp_i16_f16(int16_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 sitofp_i32_f16(int32_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 sitofp_i64_f16(int64_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 uitofp_i8_f16(uint8_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 uitofp_i16_f16(uint16_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 uitofp_i32_f16(uint32_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 uitofp_i64_f16(uint64_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR int8_t fptosi_f16_i8(f16 x) { + return (int8_t) (float) x; +} + +SCALAR_FUN_ATTR int16_t fptosi_f16_i16(f16 x) { + return (int16_t) x; +} + +SCALAR_FUN_ATTR int32_t fptosi_f16_i32(f16 x) { + return (int32_t) x; +} + +SCALAR_FUN_ATTR int64_t fptosi_f16_i64(f16 x) { + return (int64_t) x; +} + +SCALAR_FUN_ATTR uint8_t fptoui_f16_i8(f16 x) { + return (uint8_t) (float) x; +} + +SCALAR_FUN_ATTR uint16_t fptoui_f16_i16(f16 x) { + return (uint16_t) x; +} + +SCALAR_FUN_ATTR uint32_t fptoui_f16_i32(f16 x) { + return (uint32_t) x; +} + +SCALAR_FUN_ATTR uint64_t fptoui_f16_i64(f16 x) { + return (uint64_t) x; +} + +SCALAR_FUN_ATTR bool ftob_f16_bool(f16 x) { + return x != (f16)0; +} + +SCALAR_FUN_ATTR f16 btof_bool_f16(bool x) { + return x ? 1 : 0; +} + +#ifndef EMULATE_F16 +SCALAR_FUN_ATTR bool futrts_isnan16(f16 x) { + return isnan((float)x); +} + +#ifdef __OPENCL_VERSION__ + +SCALAR_FUN_ATTR f16 fabs16(f16 x) { + return fabs(x); +} + +SCALAR_FUN_ATTR f16 fmax16(f16 x, f16 y) { + return fmax(x, y); +} + +SCALAR_FUN_ATTR f16 fmin16(f16 x, f16 y) { + return fmin(x, y); +} + +SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { + return pow(x, y); +} + +#elif ISPC +SCALAR_FUN_ATTR f16 fabs16(f16 x) { + return abs(x); +} + +SCALAR_FUN_ATTR f16 fmax16(f16 x, f16 y) { + return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : max(x, y); +} + +SCALAR_FUN_ATTR f16 fmin16(f16 x, f16 y) { + return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : min(x, y); +} + +SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { + return pow(x, y); +} + +#else // Assuming CUDA. 
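+
+// Illustrative note: the explicit isnan checks in the ISPC fmax16/fmin16
+// above mirror the IEEE 754 rule that max/min of a NaN and a number is the
+// number, which ISPC's plain max/min do not promise. CUDA's fmaxf/fminf
+// already behave this way, so the wrappers below can stay plain.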
+ +SCALAR_FUN_ATTR f16 fabs16(f16 x) { + return fabsf(x); +} + +SCALAR_FUN_ATTR f16 fmax16(f16 x, f16 y) { + return fmaxf(x, y); +} + +SCALAR_FUN_ATTR f16 fmin16(f16 x, f16 y) { + return fminf(x, y); +} + +SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { + return powf(x, y); +} +#endif + +#if ISPC +SCALAR_FUN_ATTR bool futrts_isinf16(float x) { + return !futrts_isnan16(x) && futrts_isnan16(x - x); +} +SCALAR_FUN_ATTR bool futrts_isfinite16(float x) { + return !futrts_isnan16(x) && !futrts_isinf16(x); +} + +#else + +SCALAR_FUN_ATTR bool futrts_isinf16(f16 x) { + return isinf((float)x); +} +#endif + +#ifdef __OPENCL_VERSION__ +SCALAR_FUN_ATTR f16 futrts_log16(f16 x) { + return log(x); +} + +SCALAR_FUN_ATTR f16 futrts_log2_16(f16 x) { + return log2(x); +} + +SCALAR_FUN_ATTR f16 futrts_log10_16(f16 x) { + return log10(x); +} + +SCALAR_FUN_ATTR f16 futrts_log1p_16(f16 x) { + return log1p(x); +} + +SCALAR_FUN_ATTR f16 futrts_sqrt16(f16 x) { + return sqrt(x); +} + +SCALAR_FUN_ATTR f16 futrts_cbrt16(f16 x) { + return cbrt(x); +} + +SCALAR_FUN_ATTR f16 futrts_exp16(f16 x) { + return exp(x); +} + +SCALAR_FUN_ATTR f16 futrts_cos16(f16 x) { + return cos(x); +} + +SCALAR_FUN_ATTR f16 futrts_sin16(f16 x) { + return sin(x); +} + +SCALAR_FUN_ATTR f16 futrts_tan16(f16 x) { + return tan(x); +} + +SCALAR_FUN_ATTR f16 futrts_acos16(f16 x) { + return acos(x); +} + +SCALAR_FUN_ATTR f16 futrts_asin16(f16 x) { + return asin(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan16(f16 x) { + return atan(x); +} + +SCALAR_FUN_ATTR f16 futrts_cosh16(f16 x) { + return cosh(x); +} + +SCALAR_FUN_ATTR f16 futrts_sinh16(f16 x) { + return sinh(x); +} + +SCALAR_FUN_ATTR f16 futrts_tanh16(f16 x) { + return tanh(x); +} + +SCALAR_FUN_ATTR f16 futrts_acosh16(f16 x) { + return acosh(x); +} + +SCALAR_FUN_ATTR f16 futrts_asinh16(f16 x) { + return asinh(x); +} + +SCALAR_FUN_ATTR f16 futrts_atanh16(f16 x) { + return atanh(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan2_16(f16 x, f16 y) { + return atan2(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_hypot16(f16 x, f16 y) { + return hypot(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_gamma16(f16 x) { + return tgamma(x); +} + +SCALAR_FUN_ATTR f16 futrts_lgamma16(f16 x) { + return lgamma(x); +} + +SCALAR_FUN_ATTR f16 futrts_erf16(f16 x) { + return erf(x); +} + +SCALAR_FUN_ATTR f16 futrts_erfc16(f16 x) { + return erfc(x); +} + +SCALAR_FUN_ATTR f16 fmod16(f16 x, f16 y) { + return fmod(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_round16(f16 x) { + return rint(x); +} + +SCALAR_FUN_ATTR f16 futrts_floor16(f16 x) { + return floor(x); +} + +SCALAR_FUN_ATTR f16 futrts_ceil16(f16 x) { + return ceil(x); +} + +SCALAR_FUN_ATTR f16 futrts_nextafter16(f16 x, f16 y) { + return nextafter(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_lerp16(f16 v0, f16 v1, f16 t) { + return mix(v0, v1, t); +} + +SCALAR_FUN_ATTR f16 futrts_ldexp16(f16 x, int32_t y) { + return ldexp(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_copysign16(f16 x, f16 y) { + return copysign(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_mad16(f16 a, f16 b, f16 c) { + return mad(a, b, c); +} + +SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) { + return fma(a, b, c); +} +#elif ISPC + +SCALAR_FUN_ATTR f16 futrts_log16(f16 x) { + return futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0) ? 
log(x) : x; +} + +SCALAR_FUN_ATTR f16 futrts_log2_16(f16 x) { + return futrts_log16(x) / log(2.0f16); +} + +SCALAR_FUN_ATTR f16 futrts_log10_16(f16 x) { + return futrts_log16(x) / log(10.0f16); +} + +SCALAR_FUN_ATTR f16 futrts_log1p_16(f16 x) { + if(x == -1.0f16 || (futrts_isinf16(x) && x > 0.0f16)) return x / 0.0f16; + f16 y = 1.0f16 + x; + f16 z = y - 1.0f16; + return log(y) - (z-x)/y; +} + +SCALAR_FUN_ATTR f16 futrts_sqrt16(f16 x) { + return (float16)sqrt((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_exp16(f16 x) { + return exp(x); +} + +SCALAR_FUN_ATTR f16 futrts_cos16(f16 x) { + return (float16)cos((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_sin16(f16 x) { + return (float16)sin((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_tan16(f16 x) { + return (float16)tan((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_acos16(f16 x) { + return (float16)acos((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_asin16(f16 x) { + return (float16)asin((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_atan16(f16 x) { + return (float16)atan((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_cosh16(f16 x) { + return (exp(x)+exp(-x)) / 2.0f16; +} + +SCALAR_FUN_ATTR f16 futrts_sinh16(f16 x) { + return (exp(x)-exp(-x)) / 2.0f16; +} + +SCALAR_FUN_ATTR f16 futrts_tanh16(f16 x) { + return futrts_sinh16(x)/futrts_cosh16(x); +} + +SCALAR_FUN_ATTR f16 futrts_acosh16(f16 x) { + float16 f = x+(float16)sqrt((float)(x*x-1)); + if(futrts_isfinite16(f)) return log(f); + return f; +} + +SCALAR_FUN_ATTR f16 futrts_asinh16(f16 x) { + float16 f = x+(float16)sqrt((float)(x*x+1)); + if(futrts_isfinite16(f)) return log(f); + return f; +} + +SCALAR_FUN_ATTR f16 futrts_atanh16(f16 x) { + float16 f = (1+x)/(1-x); + if(futrts_isfinite16(f)) return log(f)/2.0f16; + return f; +} + +SCALAR_FUN_ATTR f16 futrts_atan2_16(f16 x, f16 y) { + return (float16)atan2((float)x, (float)y); +} + +SCALAR_FUN_ATTR f16 futrts_hypot16(f16 x, f16 y) { + return (float16)futrts_hypot32((float)x, (float)y); +} + +extern "C" unmasked uniform float tgammaf(uniform float x); +SCALAR_FUN_ATTR f16 futrts_gamma16(f16 x) { + f16 res; + foreach_active (i) { + uniform f16 r = (f16)tgammaf(extract((float)x, i)); + res = insert(res, i, r); + } + return res; +} + +extern "C" unmasked uniform float lgammaf(uniform float x); +SCALAR_FUN_ATTR f16 futrts_lgamma16(f16 x) { + f16 res; + foreach_active (i) { + uniform f16 r = (f16)lgammaf(extract((float)x, i)); + res = insert(res, i, r); + } + return res; +} + +SCALAR_FUN_ATTR f16 futrts_cbrt16(f16 x) { + f16 res = (f16)futrts_cbrt32((float)x); + return res; +} + +SCALAR_FUN_ATTR f16 futrts_erf16(f16 x) { + f16 res = (f16)futrts_erf32((float)x); + return res; +} + +SCALAR_FUN_ATTR f16 futrts_erfc16(f16 x) { + f16 res = (f16)futrts_erfc32((float)x); + return res; +} + +SCALAR_FUN_ATTR f16 fmod16(f16 x, f16 y) { + return x - y * (float16)trunc((float) (x/y)); +} + +SCALAR_FUN_ATTR f16 futrts_round16(f16 x) { + return (float16)round((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_floor16(f16 x) { + return (float16)floor((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_ceil16(f16 x) { + return (float16)ceil((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_nextafter16(f16 x, f16 y) { + return (float16)futrts_nextafter32((float)x, (float) y); +} + +SCALAR_FUN_ATTR f16 futrts_lerp16(f16 v0, f16 v1, f16 t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR f16 futrts_ldexp16(f16 x, int32_t y) { + return futrts_ldexp32((float)x, y); +} + +SCALAR_FUN_ATTR f16 futrts_copysign16(f16 x, f16 y) { + return futrts_copysign32((float)x, y); +} + +SCALAR_FUN_ATTR f16 
futrts_mad16(f16 a, f16 b, f16 c) { + return a * b + c; +} + +SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) { + return a * b + c; +} + +#else // Assume CUDA. + +SCALAR_FUN_ATTR f16 futrts_log16(f16 x) { + return hlog(x); +} + +SCALAR_FUN_ATTR f16 futrts_log2_16(f16 x) { + return hlog2(x); +} + +SCALAR_FUN_ATTR f16 futrts_log10_16(f16 x) { + return hlog10(x); +} + +SCALAR_FUN_ATTR f16 futrts_log1p_16(f16 x) { + return (f16)log1pf((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_sqrt16(f16 x) { + return hsqrt(x); +} + +SCALAR_FUN_ATTR f16 futrts_cbrt16(f16 x) { + return cbrtf(x); +} + +SCALAR_FUN_ATTR f16 futrts_exp16(f16 x) { + return hexp(x); +} + +SCALAR_FUN_ATTR f16 futrts_cos16(f16 x) { + return hcos(x); +} + +SCALAR_FUN_ATTR f16 futrts_sin16(f16 x) { + return hsin(x); +} + +SCALAR_FUN_ATTR f16 futrts_tan16(f16 x) { + return tanf(x); +} + +SCALAR_FUN_ATTR f16 futrts_acos16(f16 x) { + return acosf(x); +} + +SCALAR_FUN_ATTR f16 futrts_asin16(f16 x) { + return asinf(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan16(f16 x) { + return atanf(x); +} + +SCALAR_FUN_ATTR f16 futrts_cosh16(f16 x) { + return coshf(x); +} + +SCALAR_FUN_ATTR f16 futrts_sinh16(f16 x) { + return sinhf(x); +} + +SCALAR_FUN_ATTR f16 futrts_tanh16(f16 x) { + return tanhf(x); +} + +SCALAR_FUN_ATTR f16 futrts_acosh16(f16 x) { + return acoshf(x); +} + +SCALAR_FUN_ATTR f16 futrts_asinh16(f16 x) { + return asinhf(x); +} + +SCALAR_FUN_ATTR f16 futrts_atanh16(f16 x) { + return atanhf(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan2_16(f16 x, f16 y) { + return atan2f(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_hypot16(f16 x, f16 y) { + return hypotf(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_gamma16(f16 x) { + return tgammaf(x); +} + +SCALAR_FUN_ATTR f16 futrts_lgamma16(f16 x) { + return lgammaf(x); +} + +SCALAR_FUN_ATTR f16 futrts_erf16(f16 x) { + return erff(x); +} + +SCALAR_FUN_ATTR f16 futrts_erfc16(f16 x) { + return erfcf(x); +} + +SCALAR_FUN_ATTR f16 fmod16(f16 x, f16 y) { + return fmodf(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_round16(f16 x) { + return rintf(x); +} + +SCALAR_FUN_ATTR f16 futrts_floor16(f16 x) { + return hfloor(x); +} + +SCALAR_FUN_ATTR f16 futrts_ceil16(f16 x) { + return hceil(x); +} + +SCALAR_FUN_ATTR f16 futrts_nextafter16(f16 x, f16 y) { + return __ushort_as_half(halfbitsnextafter(__half_as_ushort(x), __half_as_ushort(y))); +} + +SCALAR_FUN_ATTR f16 futrts_lerp16(f16 v0, f16 v1, f16 t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR f16 futrts_ldexp16(f16 x, int32_t y) { + return futrts_ldexp32((float)x, y); +} + +SCALAR_FUN_ATTR f16 futrts_copysign16(f16 x, f16 y) { + return futrts_copysign32((float)x, y); +} + +SCALAR_FUN_ATTR f16 futrts_mad16(f16 a, f16 b, f16 c) { + return a * b + c; +} + +SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) { + return fmaf(a, b, c); +} + +#endif + +// The CUDA __half type cannot be put in unions for some reason, so we +// use bespoke conversion functions instead. 
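+
+// Illustrative note: in standard C the portable alternative to a union pun is
+// memcpy-based type punning, which is effectively what the intrinsics below
+// do for __half:
+//
+//   int16_t bits;
+//   memcpy(&bits, &h, sizeof bits);  // f16 -> raw bits
+//   memcpy(&h, &bits, sizeof h);     // raw bits -> f16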
+#ifdef __CUDA_ARCH__ +SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) { + return __half_as_ushort(x); +} +SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) { + return __ushort_as_half(x); +} +#elif ISPC + +SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) { + varying int16_t y = *((varying int16_t * uniform)&x); + return y; +} + +SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) { + varying f16 y = *((varying f16 * uniform)&x); + return y; +} +#else +SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) { + union { + f16 f; + int16_t t; + } p; + + p.f = x; + return p.t; +} + +SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) { + union { + int16_t f; + f16 t; + } p; + + p.f = x; + return p.t; +} +#endif + +#else // No native f16 - emulate. + +SCALAR_FUN_ATTR f16 fabs16(f16 x) { + return fabs32(x); +} + +SCALAR_FUN_ATTR f16 fmax16(f16 x, f16 y) { + return fmax32(x, y); +} + +SCALAR_FUN_ATTR f16 fmin16(f16 x, f16 y) { + return fmin32(x, y); +} + +SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { + return fpow32(x, y); +} + +SCALAR_FUN_ATTR bool futrts_isnan16(f16 x) { + return futrts_isnan32(x); +} + +SCALAR_FUN_ATTR bool futrts_isinf16(f16 x) { + return futrts_isinf32(x); +} + +SCALAR_FUN_ATTR f16 futrts_log16(f16 x) { + return futrts_log32(x); +} + +SCALAR_FUN_ATTR f16 futrts_log2_16(f16 x) { + return futrts_log2_32(x); +} + +SCALAR_FUN_ATTR f16 futrts_log10_16(f16 x) { + return futrts_log10_32(x); +} + +SCALAR_FUN_ATTR f16 futrts_log1p_16(f16 x) { + return futrts_log1p_32(x); +} + +SCALAR_FUN_ATTR f16 futrts_sqrt16(f16 x) { + return futrts_sqrt32(x); +} + +SCALAR_FUN_ATTR f16 futrts_cbrt16(f16 x) { + return futrts_cbrt32(x); +} + +SCALAR_FUN_ATTR f16 futrts_exp16(f16 x) { + return futrts_exp32(x); +} + +SCALAR_FUN_ATTR f16 futrts_cos16(f16 x) { + return futrts_cos32(x); +} + +SCALAR_FUN_ATTR f16 futrts_sin16(f16 x) { + return futrts_sin32(x); +} + +SCALAR_FUN_ATTR f16 futrts_tan16(f16 x) { + return futrts_tan32(x); +} + +SCALAR_FUN_ATTR f16 futrts_acos16(f16 x) { + return futrts_acos32(x); +} + +SCALAR_FUN_ATTR f16 futrts_asin16(f16 x) { + return futrts_asin32(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan16(f16 x) { + return futrts_atan32(x); +} + +SCALAR_FUN_ATTR f16 futrts_cosh16(f16 x) { + return futrts_cosh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_sinh16(f16 x) { + return futrts_sinh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_tanh16(f16 x) { + return futrts_tanh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_acosh16(f16 x) { + return futrts_acosh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_asinh16(f16 x) { + return futrts_asinh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_atanh16(f16 x) { + return futrts_atanh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan2_16(f16 x, f16 y) { + return futrts_atan2_32(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_hypot16(f16 x, f16 y) { + return futrts_hypot32(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_gamma16(f16 x) { + return futrts_gamma32(x); +} + +SCALAR_FUN_ATTR f16 futrts_lgamma16(f16 x) { + return futrts_lgamma32(x); +} + +SCALAR_FUN_ATTR f16 futrts_erf16(f16 x) { + return futrts_erf32(x); +} + +SCALAR_FUN_ATTR f16 futrts_erfc16(f16 x) { + return futrts_erfc32(x); +} + +SCALAR_FUN_ATTR f16 fmod16(f16 x, f16 y) { + return fmod32(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_round16(f16 x) { + return futrts_round32(x); +} + +SCALAR_FUN_ATTR f16 futrts_floor16(f16 x) { + return futrts_floor32(x); +} + +SCALAR_FUN_ATTR f16 futrts_ceil16(f16 x) { + return futrts_ceil32(x); +} + +SCALAR_FUN_ATTR f16 futrts_nextafter16(f16 x, f16 y) { + return halfbits2float(halfbitsnextafter(float2halfbits(x), 
float2halfbits(y)));
+}
+
+SCALAR_FUN_ATTR f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
+  return futrts_lerp32(v0, v1, t);
+}
+
+SCALAR_FUN_ATTR f16 futrts_ldexp16(f16 x, int32_t y) {
+  return futrts_ldexp32(x, y);
+}
+
+SCALAR_FUN_ATTR f16 futrts_copysign16(f16 x, f16 y) {
+  return futrts_copysign32((float)x, y);
+}
+
+SCALAR_FUN_ATTR f16 futrts_mad16(f16 a, f16 b, f16 c) {
+  return futrts_mad32(a, b, c);
+}
+
+SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) {
+  return futrts_fma32(a, b, c);
+}
+
+// Even when we are using an OpenCL that does not support cl_khr_fp16,
+// it must still support vload_half for actually creating a
+// half-precision number, which can then be efficiently converted to a
+// float. Similarly for vstore_half.
+#ifdef __OPENCL_VERSION__
+
+SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) {
+  int16_t y;
+  // Violating strict aliasing here.
+  vstore_half((float)x, 0, (half*)&y);
+  return y;
+}
+
+SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) {
+  return (f16)vload_half(0, (half*)&x);
+}
+
+#else
+
+SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) {
+  return (int16_t)float2halfbits(x);
+}
+
+SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) {
+  return halfbits2float((uint16_t)x);
+}
+
+#endif
+
+SCALAR_FUN_ATTR f16 fsignum16(f16 x) {
+  return futrts_isnan16(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);
+}
+
+#endif
+
+SCALAR_FUN_ATTR f16 fpconv_f16_f16(f16 x) {
+  return x;
+}
+
+SCALAR_FUN_ATTR float fpconv_f16_f32(f16 x) {
+  return x;
+}
+
+SCALAR_FUN_ATTR f16 fpconv_f32_f16(float x) {
+  return (f16) x;
+}
+
+#ifdef FUTHARK_F64_ENABLED
+
+SCALAR_FUN_ATTR double fpconv_f16_f64(f16 x) {
+  return (double) x;
+}
+
+#if ISPC
+SCALAR_FUN_ATTR f16 fpconv_f64_f16(double x) {
+  return (f16) ((float)x);
+}
+#else
+SCALAR_FUN_ATTR f16 fpconv_f64_f16(double x) {
+  return (f16) x;
+}
+#endif
+#endif
+
+
+// End of scalar_f16.h.
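+
+// Illustrative sketch (editorial, not compiler output): chaining the
+// conversions above, a value that is exactly representable in half precision
+// survives a round trip through its storage bits (1.5 encodes as 0x3E00):
+//
+//   f16 h = fpconv_f32_f16(1.5f);
+//   int16_t bits = futrts_to_bits16(h);                      // 0x3E00
+//   float back = fpconv_f16_f32(futrts_from_bits16(bits));   // == 1.5f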
+// Start of atomics.h
+
+SCALAR_FUN_ATTR int32_t atomic_xchg_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_xchg_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_cmpxchg_i32_global(volatile __global int32_t *p,
+                                                  int32_t cmp, int32_t val);
+SCALAR_FUN_ATTR int32_t atomic_cmpxchg_i32_shared(volatile __local int32_t *p,
+                                                  int32_t cmp, int32_t val);
+SCALAR_FUN_ATTR int32_t atomic_add_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_add_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR float atomic_fadd_f32_global(volatile __global float *p, float x);
+SCALAR_FUN_ATTR float atomic_fadd_f32_shared(volatile __local float *p, float x);
+SCALAR_FUN_ATTR int32_t atomic_smax_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_smax_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_smin_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_smin_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR uint32_t atomic_umax_i32_global(volatile __global uint32_t *p, uint32_t x);
+SCALAR_FUN_ATTR uint32_t atomic_umax_i32_shared(volatile __local uint32_t *p, uint32_t x);
+SCALAR_FUN_ATTR uint32_t atomic_umin_i32_global(volatile __global uint32_t *p, uint32_t x);
+SCALAR_FUN_ATTR uint32_t atomic_umin_i32_shared(volatile __local uint32_t *p, uint32_t x);
+SCALAR_FUN_ATTR int32_t atomic_and_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_and_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_or_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_or_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_xor_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_xor_i32_shared(volatile __local int32_t *p, int32_t x);
+
+SCALAR_FUN_ATTR int32_t atomic_xchg_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicExch((int32_t*)p, x);
+#else
+  return atomic_xchg(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_xchg_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicExch((int32_t*)p, x);
+#else
+  return atomic_xchg(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_cmpxchg_i32_global(volatile __global int32_t *p,
+                                                  int32_t cmp, int32_t val) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicCAS((int32_t*)p, cmp, val);
+#else
+  return atomic_cmpxchg(p, cmp, val);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_cmpxchg_i32_shared(volatile __local int32_t *p,
+                                                  int32_t cmp, int32_t val) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicCAS((int32_t*)p, cmp, val);
+#else
+  return atomic_cmpxchg(p, cmp, val);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_add_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((int32_t*)p, x);
+#else
+  return atomic_add(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_add_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((int32_t*)p, x);
+#else
+  return atomic_add(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR float atomic_fadd_f32_global(volatile __global float *p, float x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((float*)p, x);
+#else
+  // No native f32 atomic add: emulate with a compare-and-swap loop on the
+  // 32-bit pattern, retrying until no other thread updates *p in between.
+  union { int32_t i; float f; } old;
+  union { int32_t i; float f; } assumed;
+  old.f = *p;
+  do {
+    assumed.f = old.f;
+    old.f = old.f + x;
+    old.i = atomic_cmpxchg_i32_global((volatile __global int32_t*)p, assumed.i, old.i);
+  } while (assumed.i != old.i);
+  return old.f;
+#endif
+}
+
+SCALAR_FUN_ATTR float atomic_fadd_f32_shared(volatile __local float *p, float x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((float*)p, x);
+#else
+  // Same CAS-loop emulation as the global version, on local memory.
+  union { int32_t i; float f; } old;
+  union { int32_t i; float f; } assumed;
+  old.f = *p;
+  do {
+    assumed.f = old.f;
+    old.f = old.f + x;
+    old.i = atomic_cmpxchg_i32_shared((volatile __local int32_t*)p, assumed.i, old.i);
+  } while (assumed.i != old.i);
+  return old.f;
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_smax_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMax((int32_t*)p, x);
+#else
+  return atomic_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_smax_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMax((int32_t*)p, x);
+#else
+  return atomic_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_smin_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMin((int32_t*)p, x);
+#else
+  return atomic_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_smin_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMin((int32_t*)p, x);
+#else
+  return atomic_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR uint32_t atomic_umax_i32_global(volatile __global uint32_t *p, uint32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMax((uint32_t*)p, x);
+#else
+  return atomic_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR uint32_t atomic_umax_i32_shared(volatile __local uint32_t *p, uint32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMax((uint32_t*)p, x);
+#else
+  return atomic_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR uint32_t atomic_umin_i32_global(volatile __global uint32_t *p, uint32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMin((uint32_t*)p, x);
+#else
+  return atomic_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR uint32_t atomic_umin_i32_shared(volatile __local uint32_t *p, uint32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMin((uint32_t*)p, x);
+#else
+  return atomic_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_and_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAnd((int32_t*)p, x);
+#else
+  return atomic_and(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_and_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAnd((int32_t*)p, x);
+#else
+  return atomic_and(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_or_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicOr((int32_t*)p, x);
+#else
+  return atomic_or(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_or_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicOr((int32_t*)p, x);
+#else
+  return atomic_or(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_xor_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicXor((int32_t*)p, x);
+#else
+  return atomic_xor(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_xor_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicXor((int32_t*)p, x);
+#else
+  return atomic_xor(p, x);
+#endif
+}
+
+// Start of 64 bit atomics
+
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) || defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+
+SCALAR_FUN_ATTR int64_t atomic_xchg_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_xchg_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_cmpxchg_i64_global(volatile __global int64_t *p,
+                                                  int64_t cmp, int64_t val);
+SCALAR_FUN_ATTR int64_t atomic_cmpxchg_i64_shared(volatile __local int64_t *p,
+                                                  int64_t cmp, int64_t val);
+SCALAR_FUN_ATTR int64_t atomic_add_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_add_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_smax_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_smax_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_smin_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_smin_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR uint64_t atomic_umax_i64_global(volatile __global uint64_t *p, uint64_t x);
+SCALAR_FUN_ATTR uint64_t atomic_umax_i64_shared(volatile __local uint64_t *p, uint64_t x);
+SCALAR_FUN_ATTR uint64_t atomic_umin_i64_global(volatile __global uint64_t *p, uint64_t x);
+SCALAR_FUN_ATTR uint64_t atomic_umin_i64_shared(volatile __local uint64_t *p, uint64_t x);
+SCALAR_FUN_ATTR int64_t atomic_and_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_and_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_or_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_or_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_xor_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_xor_i64_shared(volatile __local int64_t *p, int64_t x);
+
+#ifdef FUTHARK_F64_ENABLED
+SCALAR_FUN_ATTR double atomic_fadd_f64_global(volatile __global double *p, double x);
+SCALAR_FUN_ATTR double atomic_fadd_f64_shared(volatile __local double *p, double x);
+#endif
+
+SCALAR_FUN_ATTR int64_t atomic_xchg_i64_global(volatile __global int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicExch((uint64_t*)p, x);
+#else
+  return atom_xchg(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_xchg_i64_shared(volatile __local int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicExch((uint64_t*)p, x);
+#else
+  return atom_xchg(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_cmpxchg_i64_global(volatile __global int64_t *p,
+                                                  int64_t cmp, int64_t val) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicCAS((uint64_t*)p, cmp, val);
+#else
+  return atom_cmpxchg(p, cmp, val);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_cmpxchg_i64_shared(volatile __local int64_t *p,
+                                                  int64_t cmp, int64_t val) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicCAS((uint64_t*)p, cmp, val);
+#else
+  return atom_cmpxchg(p, cmp, val);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_add_i64_global(volatile __global int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((uint64_t*)p, x);
+#else
+  return atom_add(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_add_i64_shared(volatile __local int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((uint64_t*)p, x);
+#else
+  return atom_add(p, x);
+#endif
+}
+
+#ifdef FUTHARK_F64_ENABLED
+
+SCALAR_FUN_ATTR double atomic_fadd_f64_global(volatile __global double *p, double x) {
+#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600 || defined(FUTHARK_HIP)
+  return atomicAdd((double*)p, x);
+#else
+  // No native f64 atomic add: same CAS-loop emulation as the f32 case, on
+  // the 64-bit pattern.
+  union { int64_t i; double f; } old;
+  union { int64_t i; double f; } assumed;
+  old.f = *p;
+  do {
+    assumed.f = old.f;
+    old.f = old.f + x;
+    old.i = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed.i, old.i);
+  } while (assumed.i != old.i);
+  return old.f;
+#endif
+}
+
+SCALAR_FUN_ATTR double atomic_fadd_f64_shared(volatile __local double *p, double x) {
+#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600 || defined(FUTHARK_HIP)
+  return atomicAdd((double*)p, x);
+#else
+  // Same CAS-loop emulation as the global version, on local memory.
+  union { int64_t i; double f; } old;
+  union { int64_t i; double f; } assumed;
+  old.f = *p;
+  do {
+    assumed.f = old.f;
+    old.f = old.f + x;
+    old.i = atomic_cmpxchg_i64_shared((volatile __local int64_t*)p, assumed.i, old.i);
+  } while (assumed.i != old.i);
+  return old.f;
+#endif
+}
+
+#endif
+
+SCALAR_FUN_ATTR int64_t atomic_smax_i64_global(volatile __global int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA)
+  return atomicMax((int64_t*)p, x);
+#elif defined(FUTHARK_HIP)
+  // Currently missing in HIP; probably a temporary oversight.
+  int64_t old = *p, assumed;
+  do {
+    assumed = old;
+    old = smax64(old, x);
+    old = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed, old);
+  } while (assumed != old);
+  return old;
+#else
+  return atom_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_smax_i64_shared(volatile __local int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA)
+  return atomicMax((int64_t*)p, x);
+#elif defined(FUTHARK_HIP)
+  // Currently missing in HIP; probably a temporary oversight.
+  int64_t old = *p, assumed;
+  do {
+    assumed = old;
+    old = smax64(old, x);
+    old = atomic_cmpxchg_i64_shared((volatile __local int64_t*)p, assumed, old);
+  } while (assumed != old);
+  return old;
+#else
+  return atom_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_smin_i64_global(volatile __global int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA)
+  return atomicMin((int64_t*)p, x);
+#elif defined(FUTHARK_HIP)
+  // Currently missing in HIP; probably a temporary oversight.
+  int64_t old = *p, assumed;
+  do {
+    assumed = old;
+    old = smin64(old, x);
+    old = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed, old);
+  } while (assumed != old);
+  return old;
+#else
+  return atom_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_smin_i64_shared(volatile __local int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA)
+  return atomicMin((int64_t*)p, x);
+#elif defined(FUTHARK_HIP)
+  // Currently missing in HIP; probably a temporary oversight.
+ int64_t old = *p, assumed; + do { + assumed = old; + old = smin64(old, x); + old = atomic_cmpxchg_i64_shared((volatile __local int64_t*)p, assumed, old); + } while (assumed != old); + return old; +#else + return atom_min(p, x); +#endif +} + +SCALAR_FUN_ATTR uint64_t atomic_umax_i64_global(volatile __global uint64_t *p, uint64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicMax((uint64_t*)p, x); +#else + return atom_max(p, x); +#endif +} + +SCALAR_FUN_ATTR uint64_t atomic_umax_i64_shared(volatile __local uint64_t *p, uint64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicMax((uint64_t*)p, x); +#else + return atom_max(p, x); +#endif +} + +SCALAR_FUN_ATTR uint64_t atomic_umin_i64_global(volatile __global uint64_t *p, uint64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicMin((uint64_t*)p, x); +#else + return atom_min(p, x); +#endif +} + +SCALAR_FUN_ATTR uint64_t atomic_umin_i64_shared(volatile __local uint64_t *p, uint64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicMin((uint64_t*)p, x); +#else + return atom_min(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_and_i64_global(volatile __global int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicAnd((uint64_t*)p, x); +#else + return atom_and(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_and_i64_shared(volatile __local int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicAnd((uint64_t*)p, x); +#else + return atom_and(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_or_i64_global(volatile __global int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicOr((uint64_t*)p, x); +#else + return atom_or(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_or_i64_shared(volatile __local int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicOr((uint64_t*)p, x); +#else + return atom_or(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_xor_i64_global(volatile __global int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicXor((uint64_t*)p, x); +#else + return atom_xor(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_xor_i64_shared(volatile __local int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicXor((uint64_t*)p, x); +#else + return atom_xor(p, x); +#endif +} + +#endif // defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) || defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) + +// End of atomics.h +// Start of transpose.cl + +#define GEN_TRANSPOSE_KERNELS(NAME, ELEM_TYPE) \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM*2, TR_TILE_DIM/TR_ELEMS_PER_THREAD, 1)\ +void map_transpose_##NAME(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + int32_t num_arrays, \ + int32_t x_elems, \ + int32_t y_elems, \ + int32_t mulx, \ + int32_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + (void)mulx; (void)muly; \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int32_t our_array_offset = 
tblock_id_2 * x_elems * y_elems; \ + int32_t odata_offset = dst_offset + our_array_offset; \ + int32_t idata_offset = src_offset + our_array_offset; \ + int32_t x_index = global_id_0; \ + int32_t y_index = tblock_id_1 * TR_TILE_DIM + get_local_id(1); \ + if (x_index < x_elems) { \ + for (int32_t j = 0; j < TR_ELEMS_PER_THREAD; j++) { \ + int32_t index_i = (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * x_elems + x_index; \ + if (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD) < y_elems) { \ + block[(get_local_id(1) + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * (TR_TILE_DIM+1) + \ + get_local_id(0)] = \ + src_mem[idata_offset + index_i]; \ + } \ + } \ + } \ + barrier_local(); \ + x_index = tblock_id_1 * TR_TILE_DIM + get_local_id(0); \ + y_index = tblock_id_0 * TR_TILE_DIM + get_local_id(1); \ + if (x_index < y_elems) { \ + for (int32_t j = 0; j < TR_ELEMS_PER_THREAD; j++) { \ + int32_t index_out = (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * y_elems + x_index; \ + if (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD) < x_elems) { \ + dst_mem[(odata_offset + index_out)] = \ + block[get_local_id(0) * (TR_TILE_DIM+1) + \ + get_local_id(1) + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)]; \ + } \ + } \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_global_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_global_size(1); \ + } \ +} \ + \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM, TR_BLOCK_DIM, 1) \ +void map_transpose_##NAME##_low_height(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + int32_t num_arrays, \ + int32_t x_elems, \ + int32_t y_elems, \ + int32_t mulx, \ + int32_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int32_t our_array_offset = tblock_id_2 * x_elems * y_elems; \ + int32_t odata_offset = dst_offset + our_array_offset; \ + int32_t idata_offset = src_offset + our_array_offset; \ + int32_t x_index = \ + tblock_id_0 * TR_BLOCK_DIM * mulx + \ + get_local_id(0) + \ + get_local_id(1)%mulx * TR_BLOCK_DIM; \ + int32_t y_index = tblock_id_1 * TR_BLOCK_DIM + get_local_id(1)/mulx; \ + int32_t index_in = y_index * x_elems + x_index; \ + if (x_index < x_elems && y_index < y_elems) { \ + block[get_local_id(1) * (TR_BLOCK_DIM+1) + get_local_id(0)] = \ + src_mem[idata_offset + index_in]; \ + } \ + barrier_local(); \ + x_index = tblock_id_1 * TR_BLOCK_DIM + get_local_id(0)/mulx; \ + y_index = \ + tblock_id_0 * TR_BLOCK_DIM * mulx + \ + get_local_id(1) + \ + (get_local_id(0)%mulx) * TR_BLOCK_DIM; \ + int32_t index_out = y_index * y_elems + x_index; \ + if (x_index < y_elems && y_index < x_elems) { \ + dst_mem[odata_offset + index_out] = \ + block[get_local_id(0) * (TR_BLOCK_DIM+1) + get_local_id(1)]; \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_global_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_global_size(1); \ + } \ +} \ + \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM, TR_BLOCK_DIM, 1) \ +void map_transpose_##NAME##_low_width(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, 
\ + int64_t src_offset, \ + int32_t num_arrays, \ + int32_t x_elems, \ + int32_t y_elems, \ + int32_t mulx, \ + int32_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int32_t our_array_offset = tblock_id_2 * x_elems * y_elems; \ + int32_t odata_offset = dst_offset + our_array_offset; \ + int32_t idata_offset = src_offset + our_array_offset; \ + int32_t x_index = tblock_id_0 * TR_BLOCK_DIM + get_local_id(0)/muly; \ + int32_t y_index = \ + tblock_id_1 * TR_BLOCK_DIM * muly + \ + get_local_id(1) + (get_local_id(0)%muly) * TR_BLOCK_DIM; \ + int32_t index_in = y_index * x_elems + x_index; \ + if (x_index < x_elems && y_index < y_elems) { \ + block[get_local_id(1) * (TR_BLOCK_DIM+1) + get_local_id(0)] = \ + src_mem[idata_offset + index_in]; \ + } \ + barrier_local(); \ + x_index = tblock_id_1 * TR_BLOCK_DIM * muly + \ + get_local_id(0) + (get_local_id(1)%muly) * TR_BLOCK_DIM; \ + y_index = tblock_id_0 * TR_BLOCK_DIM + get_local_id(1)/muly; \ + int32_t index_out = y_index * y_elems + x_index; \ + if (x_index < y_elems && y_index < x_elems) { \ + dst_mem[odata_offset + index_out] = \ + block[get_local_id(0) * (TR_BLOCK_DIM+1) + get_local_id(1)]; \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_num_tblocks(2) * get_local_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_num_tblocks(1) * get_local_size(1); \ + } \ +} \ + \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM*TR_BLOCK_DIM, 1, 1) \ +void map_transpose_##NAME##_small(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + int32_t num_arrays, \ + int32_t x_elems, \ + int32_t y_elems, \ + int32_t mulx, \ + int32_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + (void)mulx; (void)muly; \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int32_t our_array_offset = global_id_0/(y_elems * x_elems) * y_elems * x_elems; \ + int32_t x_index = (global_id_0 % (y_elems * x_elems))/y_elems; \ + int32_t y_index = global_id_0%y_elems; \ + int32_t odata_offset = dst_offset + our_array_offset; \ + int32_t idata_offset = src_offset + our_array_offset; \ + int32_t index_in = y_index * x_elems + x_index; \ + int32_t index_out = x_index * y_elems + y_index; \ + if (global_id_0 < x_elems * y_elems * num_arrays) { \ + dst_mem[odata_offset + index_out] = src_mem[idata_offset + index_in]; \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_global_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_global_size(1); \ + } \ +} \ + \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM*2, TR_TILE_DIM/TR_ELEMS_PER_THREAD, 1)\ +void map_transpose_##NAME##_large(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + 
int64_t num_arrays, \ + int64_t x_elems, \ + int64_t y_elems, \ + int64_t mulx, \ + int64_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + (void)mulx; (void)muly; \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int64_t our_array_offset = tblock_id_2 * x_elems * y_elems; \ + int64_t odata_offset = dst_offset + our_array_offset; \ + int64_t idata_offset = src_offset + our_array_offset; \ + int64_t x_index = global_id_0; \ + int64_t y_index = tblock_id_1 * TR_TILE_DIM + get_local_id(1); \ + if (x_index < x_elems) { \ + for (int64_t j = 0; j < TR_ELEMS_PER_THREAD; j++) { \ + int64_t index_i = (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * x_elems + x_index; \ + if (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD) < y_elems) { \ + block[(get_local_id(1) + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * (TR_TILE_DIM+1) + \ + get_local_id(0)] = \ + src_mem[idata_offset + index_i]; \ + } \ + } \ + } \ + barrier_local(); \ + x_index = tblock_id_1 * TR_TILE_DIM + get_local_id(0); \ + y_index = tblock_id_0 * TR_TILE_DIM + get_local_id(1); \ + if (x_index < y_elems) { \ + for (int64_t j = 0; j < TR_ELEMS_PER_THREAD; j++) { \ + int64_t index_out = (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * y_elems + x_index; \ + if (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD) < x_elems) { \ + dst_mem[(odata_offset + index_out)] = \ + block[get_local_id(0) * (TR_TILE_DIM+1) + \ + get_local_id(1) + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)]; \ + } \ + } \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_global_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_global_size(1); \ + } \ +} \ + +GEN_TRANSPOSE_KERNELS(1b, uint8_t) +GEN_TRANSPOSE_KERNELS(2b, uint16_t) +GEN_TRANSPOSE_KERNELS(4b, uint32_t) +GEN_TRANSPOSE_KERNELS(8b, uint64_t) + +// End of transpose.cl +// Start of copy.cl + +#define GEN_COPY_KERNEL(NAME, ELEM_TYPE) \ +FUTHARK_KERNEL void lmad_copy_##NAME(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + int64_t n, \ + int r, \ + int64_t shape0, int64_t dst_stride0, int64_t src_stride0, \ + int64_t shape1, int64_t dst_stride1, int64_t src_stride1, \ + int64_t shape2, int64_t dst_stride2, int64_t src_stride2, \ + int64_t shape3, int64_t dst_stride3, int64_t src_stride3, \ + int64_t shape4, int64_t dst_stride4, int64_t src_stride4, \ + int64_t shape5, int64_t dst_stride5, int64_t src_stride5, \ + int64_t shape6, int64_t dst_stride6, int64_t src_stride6, \ + int64_t shape7, int64_t dst_stride7, int64_t src_stride7) { \ + int64_t gtid = get_global_id(0); \ + int64_t remainder = gtid; \ + \ + if (gtid >= n) { \ + return; \ + } \ + \ + if (r > 0) { \ + int64_t i = remainder % shape0; \ + dst_offset += i * dst_stride0; \ + src_offset += i * src_stride0; \ + remainder /= shape0; \ + } \ + if (r > 1) { \ + int64_t i = remainder % shape1; \ + dst_offset += i * dst_stride1; \ + src_offset += i * src_stride1; \ + remainder /= shape1; \ + } \ + if (r > 2) { \ + int64_t i = remainder % shape2; \ + dst_offset += i * dst_stride2; \ + src_offset += i * src_stride2; \ + remainder /= shape2; \ + } \ + if (r > 3) { \ + int64_t i = remainder % shape3; 
\ + dst_offset += i * dst_stride3; \ + src_offset += i * src_stride3; \ + remainder /= shape3; \ + } \ + if (r > 4) { \ + int64_t i = remainder % shape4; \ + dst_offset += i * dst_stride4; \ + src_offset += i * src_stride4; \ + remainder /= shape4; \ + } \ + if (r > 5) { \ + int64_t i = remainder % shape5; \ + dst_offset += i * dst_stride5; \ + src_offset += i * src_stride5; \ + remainder /= shape5; \ + } \ + if (r > 6) { \ + int64_t i = remainder % shape6; \ + dst_offset += i * dst_stride6; \ + src_offset += i * src_stride6; \ + remainder /= shape6; \ + } \ + if (r > 7) { \ + int64_t i = remainder % shape7; \ + dst_offset += i * dst_stride7; \ + src_offset += i * src_stride7; \ + remainder /= shape7; \ + } \ + \ + dst_mem[dst_offset] = src_mem[src_offset]; \ +} + +GEN_COPY_KERNEL(1b, uint8_t) +GEN_COPY_KERNEL(2b, uint16_t) +GEN_COPY_KERNEL(4b, uint32_t) +GEN_COPY_KERNEL(8b, uint64_t) + +// End of copy.cl + + + +FUTHARK_KERNEL +void builtinzhreplicate_i32zireplicate_6875(__local uint64_t *shared_mem_aligned, int64_t num_elems_6871, int32_t val_6872, int64_t replicate_n_6874, int64_t virt_num_tblocks_6880, int64_t num_tblocks_6881, __global unsigned char *mem_6870) +{ + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + int32_t replicate_ltid_6876; + int32_t tblock_sizze_6878; + int32_t replicate_gid_6877; + int32_t replicate_gtid_6875; + int32_t phys_tblock_id_6882; + int32_t iterations_6883; + + replicate_ltid_6876 = get_local_id(0); + tblock_sizze_6878 = get_local_size(0); + replicate_gid_6877 = get_tblock_id(0); + replicate_gtid_6875 = replicate_gid_6877 * tblock_sizze_6878 + replicate_ltid_6876; + phys_tblock_id_6882 = get_tblock_id(0); + iterations_6883 = sdiv_up32(sext_i64_i32(virt_num_tblocks_6880) - phys_tblock_id_6882, sext_i64_i32(num_tblocks_6881)); + for (int32_t i_6884 = 0; i_6884 < iterations_6883; i_6884++) { + int32_t virt_tblock_id_6885; + int64_t global_tid_6886; + int64_t slice_6888; + int64_t rep_i_6887; + int64_t remnant_6889; + + virt_tblock_id_6885 = phys_tblock_id_6882 + i_6884 * sext_i64_i32(num_tblocks_6881); + global_tid_6886 = sext_i32_i64(virt_tblock_id_6885) * sext_i32_i64(tblock_sizze_6878) + sext_i32_i64(replicate_ltid_6876); + slice_6888 = num_elems_6871; + rep_i_6887 = global_tid_6886; + remnant_6889 = global_tid_6886 - rep_i_6887; + if (slt64(global_tid_6886, replicate_n_6874)) { + ((__global int32_t *) mem_6870)[rep_i_6887] = val_6872; + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_1: + return; +} +FUTHARK_KERNEL +void builtinzhreplicate_i64zireplicate_6799(__local uint64_t *shared_mem_aligned, int64_t num_elems_6795, int64_t val_6796, int64_t replicate_n_6798, int64_t virt_num_tblocks_6804, int64_t num_tblocks_6805, __global unsigned char *mem_6794) +{ + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + int32_t replicate_ltid_6800; + int32_t tblock_sizze_6802; + int32_t replicate_gid_6801; + int32_t replicate_gtid_6799; + int32_t phys_tblock_id_6806; + int32_t iterations_6807; + + replicate_ltid_6800 = get_local_id(0); + tblock_sizze_6802 = get_local_size(0); + replicate_gid_6801 = get_tblock_id(0); + replicate_gtid_6799 = replicate_gid_6801 * tblock_sizze_6802 + replicate_ltid_6800; + phys_tblock_id_6806 = get_tblock_id(0); + iterations_6807 = sdiv_up32(sext_i64_i32(virt_num_tblocks_6804) - phys_tblock_id_6806, sext_i64_i32(num_tblocks_6805)); + for (int32_t i_6808 = 0; i_6808 < iterations_6807; i_6808++) { + int32_t virt_tblock_id_6809; + int64_t 
global_tid_6810; + int64_t slice_6812; + int64_t rep_i_6811; + int64_t remnant_6813; + + virt_tblock_id_6809 = phys_tblock_id_6806 + i_6808 * sext_i64_i32(num_tblocks_6805); + global_tid_6810 = sext_i32_i64(virt_tblock_id_6809) * sext_i32_i64(tblock_sizze_6802) + sext_i32_i64(replicate_ltid_6800); + slice_6812 = num_elems_6795; + rep_i_6811 = global_tid_6810; + remnant_6813 = global_tid_6810 - rep_i_6811; + if (slt64(global_tid_6810, replicate_n_6798)) { + ((__global int64_t *) mem_6794)[rep_i_6811] = val_6796; + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_1: + return; +} +FUTHARK_KERNEL_SIZED(byte_histogramziseghist_global_6312_dim1, 1, 1) +void byte_histogramziseghist_global_6312(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t n_5765, int64_t num_tblocks_6307, int64_t num_subhistos_6815, int32_t chk_i_6885, int64_t hist_H_chk_6886, __global unsigned char *xs_mem_6757, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define seghist_tblock_sizze_6305 (byte_histogramziseghist_global_6312ziseghist_tblock_sizze_6305) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6888; + int32_t tblock_sizze_6891; + int32_t wave_sizze_6890; + int32_t block_id_6889; + int32_t global_tid_6887; + int64_t phys_tid_6312; + int32_t subhisto_ind_6892; + int64_t num_chunks_6893; + + local_tid_6888 = get_local_id(0); + tblock_sizze_6891 = get_local_size(0); + wave_sizze_6890 = LOCKSTEP_WIDTH; + block_id_6889 = get_tblock_id(0); + global_tid_6887 = block_id_6889 * tblock_sizze_6891 + local_tid_6888; + phys_tid_6312 = sext_i32_i64(global_tid_6887); + subhisto_ind_6892 = squot32(global_tid_6887, sdiv_up32(sext_i64_i32(seghist_tblock_sizze_6305 * num_tblocks_6307), sext_i64_i32(num_subhistos_6815))); + num_chunks_6893 = sdiv_up64(n_5765, sext_i32_i64(sext_i64_i32(seghist_tblock_sizze_6305 * num_tblocks_6307))); + for (int64_t chunk_i_6894 = 0; chunk_i_6894 < num_chunks_6893; chunk_i_6894++) { + int64_t i_6895 = chunk_i_6894 * sext_i32_i64(sext_i64_i32(seghist_tblock_sizze_6305 * num_tblocks_6307)) + sext_i32_i64(global_tid_6887); + + if (slt64(i_6895, n_5765)) { + int64_t slice_6896; + int64_t gtid_6311; + int64_t remnant_6897; + + slice_6896 = n_5765; + gtid_6311 = i_6895; + remnant_6897 = i_6895 - gtid_6311; + if (slt64(i_6895, n_5765)) { + int8_t eta_p_6316; + int64_t u8_res_6318; + + eta_p_6316 = ((__global int8_t *) xs_mem_6757)[gtid_6311]; + u8_res_6318 = zext_i8_i64(eta_p_6316); + // save map-out results + { } + // perform atomic updates + { + if (sle64(sext_i32_i64(chk_i_6885) * hist_H_chk_6886, u8_res_6318) && (slt64(u8_res_6318, sext_i32_i64(chk_i_6885) * hist_H_chk_6886 + hist_H_chk_6886) && (sle64((int64_t) 0, u8_res_6318) && slt64(u8_res_6318, (int64_t) 256)))) { + int64_t eta_p_6313; + int64_t eta_p_6314 = (int64_t) 1; + int64_t old_6898; + + old_6898 = atomic_add_i64_global(&((volatile __global int64_t *) defunc_0_map_res_subhistos_mem_6816)[sext_i32_i64(subhisto_ind_6892) * (int64_t) 256 + u8_res_6318], (int64_t) eta_p_6314); + } + } + } + } + } + + error_0: + return; + #undef seghist_tblock_sizze_6305 +} +FUTHARK_KERNEL_SIZED(byte_histogramziseghist_local_6312_dim1, 1, 1) +void byte_histogramziseghist_local_6312(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t n_5765, int64_t num_subhistos_6815, int64_t num_tblocks_6826, int32_t hist_M_6832, int32_t chk_i_6836, int64_t num_segments_6837, int64_t 
hist_H_chk_6838, int64_t histo_sizze_6839, int32_t init_per_thread_6840, __global unsigned char *xs_mem_6757, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define max_tblock_sizze_6825 (byte_histogramziseghist_local_6312zimax_tblock_sizze_6825) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *subhistogram_local_mem_6854_backing_0 = &shared_mem[0]; + const int64_t subhistogram_local_mem_6854_backing_0_offset = 0 + ((int64_t) 8 * (hist_M_6832 * hist_H_chk_6838) + srem64((int64_t) 8 - srem64((int64_t) 8 * (hist_M_6832 * hist_H_chk_6838), (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6842; + int32_t tblock_sizze_6845; + int32_t wave_sizze_6844; + int32_t block_id_6843; + int32_t global_tid_6841; + int64_t phys_tid_6312; + int32_t phys_tblock_id_6846; + int32_t iterations_6847; + + local_tid_6842 = get_local_id(0); + tblock_sizze_6845 = get_local_size(0); + wave_sizze_6844 = LOCKSTEP_WIDTH; + block_id_6843 = get_tblock_id(0); + global_tid_6841 = block_id_6843 * tblock_sizze_6845 + local_tid_6842; + phys_tid_6312 = sext_i32_i64(global_tid_6841); + phys_tblock_id_6846 = get_tblock_id(0); + iterations_6847 = sdiv_up32(sext_i64_i32(num_tblocks_6826 * num_segments_6837) - phys_tblock_id_6846, sext_i64_i32(num_tblocks_6826)); + for (int32_t i_6848 = 0; i_6848 < iterations_6847; i_6848++) { + int32_t virt_tblock_id_6849; + int32_t flat_segment_id_6850; + int32_t gid_in_segment_6851; + int32_t pgtid_in_segment_6852; + int32_t threads_per_segment_6853; + __local unsigned char *subhistogram_local_mem_6854; + int32_t thread_local_subhisto_i_6856; + int64_t num_chunks_6863; + + virt_tblock_id_6849 = phys_tblock_id_6846 + i_6848 * sext_i64_i32(num_tblocks_6826); + flat_segment_id_6850 = squot32(virt_tblock_id_6849, sext_i64_i32(num_tblocks_6826)); + gid_in_segment_6851 = srem32(virt_tblock_id_6849, sext_i64_i32(num_tblocks_6826)); + pgtid_in_segment_6852 = gid_in_segment_6851 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + threads_per_segment_6853 = sext_i64_i32(num_tblocks_6826 * max_tblock_sizze_6825); + subhistogram_local_mem_6854 = (__local unsigned char *) subhistogram_local_mem_6854_backing_0; + thread_local_subhisto_i_6856 = srem32(local_tid_6842, hist_M_6832); + // initialize histograms in shared memory + { + for (int32_t local_i_6857 = 0; local_i_6857 < init_per_thread_6840; local_i_6857++) { + int32_t j_6858 = local_i_6857 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + int32_t j_offset_6859 = hist_M_6832 * sext_i64_i32(histo_sizze_6839) * gid_in_segment_6851 + j_6858; + int32_t local_subhisto_i_6860 = squot32(j_6858, sext_i64_i32(histo_sizze_6839)); + int32_t global_subhisto_i_6861 = squot32(j_offset_6859, sext_i64_i32(histo_sizze_6839)); + + if (slt32(j_6858, hist_M_6832 * sext_i64_i32(histo_sizze_6839))) { + // First subhistogram is initialised from global memory; others with neutral element. 
+ { + if (global_subhisto_i_6861 == 0 && ((sle64((int64_t) 0, (int64_t) 0) && slt64((int64_t) 0, num_subhistos_6815)) && (sle64((int64_t) 0, sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838) && slt64(sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838, (int64_t) 256)))) { + int64_t tmp_6862 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838]; + + ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(local_subhisto_i_6860) * hist_H_chk_6838 + sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839)))] = tmp_6862; + } else { + ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(local_subhisto_i_6860) * hist_H_chk_6838 + sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839)))] = (int64_t) 0; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + num_chunks_6863 = sdiv_up64(n_5765, sext_i32_i64(threads_per_segment_6853)); + for (int64_t chunk_i_6864 = 0; chunk_i_6864 < num_chunks_6863; chunk_i_6864++) { + int64_t i_6865 = chunk_i_6864 * sext_i32_i64(threads_per_segment_6853) + sext_i32_i64(pgtid_in_segment_6852); + + if (slt64(i_6865, n_5765)) { + int64_t gtid_6311; + int8_t eta_p_6316; + int64_t u8_res_6318; + + gtid_6311 = i_6865; + eta_p_6316 = ((__global int8_t *) xs_mem_6757)[gtid_6311]; + u8_res_6318 = zext_i8_i64(eta_p_6316); + if (chk_i_6836 == 0) { + // save map-out results + { } + } + // perform atomic updates + { + if ((sle64((int64_t) 0, u8_res_6318) && slt64(u8_res_6318, (int64_t) 256)) && (sle64(sext_i32_i64(chk_i_6836) * hist_H_chk_6838, u8_res_6318) && slt64(u8_res_6318, sext_i32_i64(chk_i_6836) * hist_H_chk_6838 + hist_H_chk_6838))) { + int64_t eta_p_6313; + int64_t eta_p_6314 = (int64_t) 1; + int64_t old_6866; + + old_6866 = atomic_add_i64_shared(&((volatile __local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(thread_local_subhisto_i_6856) * hist_H_chk_6838 + (u8_res_6318 - sext_i32_i64(chk_i_6836) * hist_H_chk_6838)], (int64_t) eta_p_6314); + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + // Compact the multiple shared memory subhistograms to result in global memory + { + int64_t trunc_H_6867 = smin64(hist_H_chk_6838, (int64_t) 256 - sext_i32_i64(chk_i_6836) * hist_H_chk_6838); + int32_t histo_sizze_6868 = sext_i64_i32(trunc_H_6867); + + for (int32_t local_i_6869 = 0; local_i_6869 < init_per_thread_6840; local_i_6869++) { + int32_t j_6870 = local_i_6869 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + + if (slt32(j_6870, histo_sizze_6868)) { + int64_t eta_p_6313; + int64_t eta_p_6314; + + // Read values from subhistogram 0. + { + eta_p_6313 = ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(j_6870)]; + } + // Accumulate based on values in other subhistograms. + { + for (int32_t subhisto_id_6871 = 0; subhisto_id_6871 < hist_M_6832 - 1; subhisto_id_6871++) { + eta_p_6314 = ((__local int64_t *) subhistogram_local_mem_6854)[(sext_i32_i64(subhisto_id_6871) + (int64_t) 1) * hist_H_chk_6838 + sext_i32_i64(j_6870)]; + + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + eta_p_6313 = defunc_0_op_res_6315; + } + } + // Put final bucket value in global memory. 
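+ // [annotation] each block then writes its summed buckets into its own slot of the global subhistogram array (virt_tblock_id mod num_tblocks); the segred kernels below combine those slots into the final 256-bucket histogram.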
+ { + ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[srem64(sext_i32_i64(virt_tblock_id_6849), num_tblocks_6826) * (int64_t) 256 + (sext_i32_i64(j_6870) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838)] = eta_p_6313; + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_2: + return; + #undef max_tblock_sizze_6825 +} +FUTHARK_KERNEL_SIZED(byte_histogramzisegred_large_6901_dim1, 1, 1) +void byte_histogramzisegred_large_6901(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t num_tblocks_6307, int64_t num_subhistos_6815, int64_t blocks_per_segment_6932, int64_t q_6933, int64_t num_virtblocks_6934, int64_t threads_per_segment_6935, __global unsigned char *mem_6758, __global unsigned char *defunc_0_map_res_subhistos_mem_6816, __global unsigned char *segred_tmp_mem_6936, __global unsigned char *counters_mem_6938) +{ + #define seghist_tblock_sizze_6305 (byte_histogramzisegred_large_6901ziseghist_tblock_sizze_6305) + #define chunk_sizze_6902 (byte_histogramzisegred_large_6901zichunk_sizze_6902) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *sync_arr_mem_6967_backing_1 = &shared_mem[0]; + const int64_t sync_arr_mem_6967_backing_1_offset = 0 + 8; + volatile __local unsigned char *red_arr_i64_mem_6965_backing_0 = &shared_mem[sync_arr_mem_6967_backing_1_offset]; + const int64_t red_arr_i64_mem_6965_backing_0_offset = sync_arr_mem_6967_backing_1_offset + ((int64_t) 8 * seghist_tblock_sizze_6305 + srem64((int64_t) 8 - srem64((int64_t) 8 * seghist_tblock_sizze_6305, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6961; + int32_t tblock_sizze_6964; + int32_t wave_sizze_6963; + int32_t block_id_6962; + int32_t global_tid_6960; + int64_t flat_gtid_6901; + __local unsigned char *red_arr_i64_mem_6965; + __local unsigned char *sync_arr_mem_6967; + int32_t phys_tblock_id_6969; + int32_t iterations_6970; + + local_tid_6961 = get_local_id(0); + tblock_sizze_6964 = get_local_size(0); + wave_sizze_6963 = LOCKSTEP_WIDTH; + block_id_6962 = get_tblock_id(0); + global_tid_6960 = block_id_6962 * tblock_sizze_6964 + local_tid_6961; + flat_gtid_6901 = sext_i32_i64(global_tid_6960); + red_arr_i64_mem_6965 = (__local unsigned char *) red_arr_i64_mem_6965_backing_0; + sync_arr_mem_6967 = (__local unsigned char *) sync_arr_mem_6967_backing_1; + phys_tblock_id_6969 = get_tblock_id(0); + iterations_6970 = sdiv_up32(sext_i64_i32(num_virtblocks_6934) - phys_tblock_id_6969, sext_i64_i32(num_tblocks_6307)); + for (int32_t i_6971 = 0; i_6971 < iterations_6970; i_6971++) { + int32_t virt_tblock_id_6972; + int64_t flat_segment_id_6973; + int64_t global_tid_6974; + int64_t slice_6975; + int64_t bucket_id_6899; + int64_t remnant_6976; + int64_t subhistogram_id_6900; + int64_t eta_p_block_res_acc_6977; + int64_t eta_p_6313; + int64_t eta_p_6314; + int64_t tblock_id_in_segment_6981; + int64_t block_base_offset_6982; + int32_t offset_6985; + int32_t skip_waves_6986; + int64_t eta_p_6978; + int64_t eta_p_6979; + + virt_tblock_id_6972 = phys_tblock_id_6969 + i_6971 * sext_i64_i32(num_tblocks_6307); + flat_segment_id_6973 = squot64(sext_i32_i64(virt_tblock_id_6972), blocks_per_segment_6932); + global_tid_6974 = srem64(sext_i32_i64(virt_tblock_id_6972) * seghist_tblock_sizze_6305 + sext_i32_i64(local_tid_6961), threads_per_segment_6935); + slice_6975 = (int64_t) 256; + bucket_id_6899 = flat_segment_id_6973; + remnant_6976 = flat_segment_id_6973 - bucket_id_6899; + // 
ne-initialise the outer (per-block) accumulator(s) + { + eta_p_block_res_acc_6977 = (int64_t) 0; + } + tblock_id_in_segment_6981 = squot64(global_tid_6974, seghist_tblock_sizze_6305); + block_base_offset_6982 = tblock_id_in_segment_6981 * q_6933 * seghist_tblock_sizze_6305; + for (int64_t i_6983 = 0; i_6983 < q_6933; i_6983++) { + int64_t block_offset_6984 = block_base_offset_6982 + i_6983 * seghist_tblock_sizze_6305; + + subhistogram_id_6900 = global_tid_6974 + threads_per_segment_6935 * i_6983; + if (slt64(subhistogram_id_6900, num_subhistos_6815)) { + // apply map function(s) + { + // load accumulator(s) + { + eta_p_6313 = eta_p_block_res_acc_6977; + } + // load next value(s) + { + eta_p_6314 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[subhistogram_id_6900 * (int64_t) 256 + bucket_id_6899]; + } + // apply reduction operator(s) + { + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + // store in accumulator(s) + { + eta_p_block_res_acc_6977 = defunc_0_op_res_6315; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // store accs. prims go in lmem; non-prims in params (in global mem) + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_block_res_acc_6977; + } + barrier(CLK_LOCAL_MEM_FENCE); + skip_waves_6986 = 1; + offset_6985 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6961, sext_i64_i32(seghist_tblock_sizze_6305))) { + eta_p_6978 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + } + offset_6985 = 1; + while (slt32(offset_6985, wave_sizze_6963)) { + if (slt32(local_tid_6961 + offset_6985, sext_i64_i32(seghist_tblock_sizze_6305)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) & (2 * offset_6985 - 1)) == 0) { + // read array element + { + eta_p_6979 = ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + offset_6985 *= 2; + } + while (slt32(skip_waves_6986, squot32(sext_i64_i32(seghist_tblock_sizze_6305) + wave_sizze_6963 - 1, wave_sizze_6963))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6985 = skip_waves_6986 * wave_sizze_6963; + if (slt32(local_tid_6961 + offset_6985, sext_i64_i32(seghist_tblock_sizze_6305)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) == 0 && (squot32(local_tid_6961, wave_sizze_6963) & (2 * skip_waves_6986 - 1)) == 0)) { + // read array element + { + eta_p_6979 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + skip_waves_6986 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + // thread 0 updates per-block acc(s); rest reset to ne + { + if (sext_i32_i64(local_tid_6961) == (int64_t) 0) { + eta_p_block_res_acc_6977 = eta_p_6978; + } else { + eta_p_block_res_acc_6977 = (int64_t) 0; + } + } + if (blocks_per_segment_6932 == (int64_t) 1) { + // first thread in block saves final result to 
memory + { + if (local_tid_6961 == 0) { + ((__global int64_t *) mem_6758)[bucket_id_6899] = eta_p_block_res_acc_6977; + } + } + } else { + int32_t old_counter_6987; + bool is_last_block_6988; + + // first thread in block saves block result to global memory + { + if (local_tid_6961 == 0) { + ((__global int64_t *) segred_tmp_mem_6936)[sext_i32_i64(virt_tblock_id_6972)] = eta_p_block_res_acc_6977; + mem_fence_global(); + old_counter_6987 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6938)[srem64(flat_segment_id_6973, (int64_t) 20480)], (int) 1); + ((__local bool *) sync_arr_mem_6967)[(int64_t) 0] = old_counter_6987 == sext_i64_i32(blocks_per_segment_6932 - (int64_t) 1); + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + is_last_block_6988 = ((__local bool *) sync_arr_mem_6967)[(int64_t) 0]; + if (is_last_block_6988) { + if (local_tid_6961 == 0) { + old_counter_6987 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6938)[srem64(flat_segment_id_6973, (int64_t) 20480)], (int) sext_i64_i32((int64_t) 0 - blocks_per_segment_6932)); + } + // read in the per-block-results + { + int64_t read_per_thread_6989 = sdiv_up64(blocks_per_segment_6932, seghist_tblock_sizze_6305); + + eta_p_6313 = (int64_t) 0; + for (int64_t i_6990 = 0; i_6990 < read_per_thread_6989; i_6990++) { + int64_t block_res_id_6991 = sext_i32_i64(local_tid_6961) * read_per_thread_6989 + i_6990; + int64_t index_of_block_res_6992 = flat_segment_id_6973 * blocks_per_segment_6932 + block_res_id_6991; + + if (slt64(block_res_id_6991, blocks_per_segment_6932)) { + eta_p_6314 = ((__global int64_t *) segred_tmp_mem_6936)[index_of_block_res_6992]; + + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + eta_p_6313 = defunc_0_op_res_6315; + } + } + } + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6313; + barrier(CLK_LOCAL_MEM_FENCE); + // reduce the per-block results + { + int32_t offset_6993; + int32_t skip_waves_6994 = 1; + int64_t eta_p_6978; + int64_t eta_p_6979; + + offset_6993 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6961, sext_i64_i32(seghist_tblock_sizze_6305))) { + eta_p_6978 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + } + offset_6993 = 1; + while (slt32(offset_6993, wave_sizze_6963)) { + if (slt32(local_tid_6961 + offset_6993, sext_i64_i32(seghist_tblock_sizze_6305)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) & (2 * offset_6993 - 1)) == 0) { + // read array element + { + eta_p_6979 = ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + offset_6993 *= 2; + } + while (slt32(skip_waves_6994, squot32(sext_i64_i32(seghist_tblock_sizze_6305) + wave_sizze_6963 - 1, wave_sizze_6963))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6993 = skip_waves_6994 * wave_sizze_6963; + if (slt32(local_tid_6961 + offset_6993, sext_i64_i32(seghist_tblock_sizze_6305)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) == 0 && (squot32(local_tid_6961, wave_sizze_6963) & (2 * skip_waves_6994 - 1)) == 0)) { + // read array element + { + eta_p_6979 = ((__local int64_t *) 
red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + skip_waves_6994 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + // and back to memory with the final result + { + if (local_tid_6961 == 0) { + ((__global int64_t *) mem_6758)[bucket_id_6899] = eta_p_6978; + } + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_6: + return; + #undef seghist_tblock_sizze_6305 + #undef chunk_sizze_6902 +} +FUTHARK_KERNEL_SIZED(byte_histogramzisegred_small_6901_dim1, 1, 1) +void byte_histogramzisegred_small_6901(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t num_tblocks_6307, int64_t num_subhistos_6815, int64_t segment_sizze_nonzzero_6903, __global unsigned char *mem_6758, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define seghist_tblock_sizze_6305 (byte_histogramzisegred_small_6901ziseghist_tblock_sizze_6305) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *red_arr_i64_mem_6910_backing_0 = &shared_mem[0]; + const int64_t red_arr_i64_mem_6910_backing_0_offset = 0 + ((int64_t) 8 * seghist_tblock_sizze_6305 + srem64((int64_t) 8 - srem64((int64_t) 8 * seghist_tblock_sizze_6305, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6906; + int32_t tblock_sizze_6909; + int32_t wave_sizze_6908; + int32_t block_id_6907; + int32_t global_tid_6905; + int64_t flat_gtid_6901; + __local unsigned char *red_arr_i64_mem_6910; + int32_t phys_tblock_id_6912; + int32_t iterations_6913; + + local_tid_6906 = get_local_id(0); + tblock_sizze_6909 = get_local_size(0); + wave_sizze_6908 = LOCKSTEP_WIDTH; + block_id_6907 = get_tblock_id(0); + global_tid_6905 = block_id_6907 * tblock_sizze_6909 + local_tid_6906; + flat_gtid_6901 = sext_i32_i64(global_tid_6905); + red_arr_i64_mem_6910 = (__local unsigned char *) red_arr_i64_mem_6910_backing_0; + phys_tblock_id_6912 = get_tblock_id(0); + iterations_6913 = sdiv_up32(sext_i64_i32(sdiv_up64((int64_t) 256, squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903))) - phys_tblock_id_6912, sext_i64_i32(num_tblocks_6307)); + for (int32_t i_6914 = 0; i_6914 < iterations_6913; i_6914++) { + int32_t virt_tblock_id_6915; + int64_t slice_6916; + int64_t bucket_id_6899; + int64_t remnant_6917; + int64_t subhistogram_id_6900; + + virt_tblock_id_6915 = phys_tblock_id_6912 + i_6914 * sext_i64_i32(num_tblocks_6307); + slice_6916 = (int64_t) 256; + bucket_id_6899 = squot64(sext_i32_i64(local_tid_6906), segment_sizze_nonzzero_6903) + sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903); + remnant_6917 = squot64(sext_i32_i64(local_tid_6906), segment_sizze_nonzzero_6903) + sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903) - bucket_id_6899; + subhistogram_id_6900 = srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815); + // apply map function if in bounds + { + if (slt64((int64_t) 0, num_subhistos_6815) && (slt64(bucket_id_6899, (int64_t) 256) && slt64(sext_i32_i64(local_tid_6906), num_subhistos_6815 * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903)))) { + // save results to be reduced + { + int64_t tmp_6918 = 
((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[subhistogram_id_6900 * (int64_t) 256 + bucket_id_6899]; + + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = tmp_6918; + } + } else { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = (int64_t) 0; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (slt64((int64_t) 0, num_subhistos_6815)) { + // perform segmented scan to imitate reduction + { + int64_t eta_p_6313; + int64_t eta_p_6314; + int64_t eta_p_6919; + int64_t eta_p_6920; + bool ltid_in_bounds_6922 = slt64(sext_i32_i64(local_tid_6906), num_subhistos_6815 * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903)); + int32_t skip_threads_6923; + + // read input for in-block scan + { + if (ltid_in_bounds_6922) { + eta_p_6314 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)]; + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 0) { + eta_p_6313 = eta_p_6314; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6923 = 1; + while (slt32(skip_threads_6923, 32)) { + bool thread_active_6924 = sle32(skip_threads_6923, local_tid_6906 - squot32(local_tid_6906, 32) * 32) && ltid_in_bounds_6922; + + if (thread_active_6924) { + // read operands + { + eta_p_6313 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906) - sext_i32_i64(skip_threads_6923)]; + } + } + // perform operation + { + bool inactive_6925 = slt64(srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815), sext_i32_i64(local_tid_6906) - sext_i32_i64(local_tid_6906 - skip_threads_6923)); + + if (thread_active_6924 && inactive_6925) { + eta_p_6313 = eta_p_6314; + } + if (thread_active_6924) { + if (!inactive_6925) { + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + eta_p_6313 = defunc_0_op_res_6315; + } + } + } + if (sle32(wave_sizze_6908, skip_threads_6923)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6924) { + // write result + { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6313; + eta_p_6314 = eta_p_6313; + } + } + if (sle32(wave_sizze_6908, skip_threads_6923)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6923 *= 2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // last thread of block 'i' writes its result to offset 'i' + { + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 31 && ltid_in_bounds_6922) { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(squot32(local_tid_6906, 32))] = eta_p_6313; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // scan the first block, after which offset 'i' contains carry-in for block 'i+1' + { + int32_t skip_threads_6926; + + // read input for in-block scan + { + if (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922) { + eta_p_6920 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)]; + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 0) { + eta_p_6919 = eta_p_6920; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6926 = 1; + while (slt32(skip_threads_6926, 32)) { + bool thread_active_6927 = sle32(skip_threads_6926, local_tid_6906 - squot32(local_tid_6906, 32) * 32) && (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922); + + if (thread_active_6927) { + // read operands + { + eta_p_6919 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906) - sext_i32_i64(skip_threads_6926)]; + } + } + // perform operation + { + bool inactive_6928 
= slt64(srem64(sext_i32_i64(local_tid_6906 * 32 + 32 - 1), num_subhistos_6815), sext_i32_i64(local_tid_6906 * 32 + 32 - 1) - sext_i32_i64((local_tid_6906 - skip_threads_6926) * 32 + 32 - 1)); + + if (thread_active_6927 && inactive_6928) { + eta_p_6919 = eta_p_6920; + } + if (thread_active_6927) { + if (!inactive_6928) { + int64_t defunc_0_op_res_6921 = add64(eta_p_6919, eta_p_6920); + + eta_p_6919 = defunc_0_op_res_6921; + } + } + } + if (sle32(wave_sizze_6908, skip_threads_6926)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6927) { + // write result + { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6919; + eta_p_6920 = eta_p_6919; + } + } + if (sle32(wave_sizze_6908, skip_threads_6926)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6926 *= 2; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + bool no_carry_in_6929 = squot32(local_tid_6906, 32) == 0 || !ltid_in_bounds_6922; + + // carry-in for every block except the first + { + // read operands + { + if (!no_carry_in_6929) { + eta_p_6314 = eta_p_6313; + eta_p_6313 = ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(squot32(local_tid_6906, 32)) - (int64_t) 1]; + } + } + // perform operation + { + bool inactive_6930 = slt64(srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815), sext_i32_i64(local_tid_6906) - sext_i32_i64(squot32(local_tid_6906, 32) * 32 - 1)); + + if (!no_carry_in_6929) { + if (inactive_6930) { + eta_p_6313 = eta_p_6314; + } + } + if (!no_carry_in_6929) { + if (!inactive_6930) { + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + eta_p_6313 = defunc_0_op_res_6315; + } + } + } + // write final result + { + if (!no_carry_in_6929) { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6313; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // restore correct values for first block + { + if (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922) { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6314; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // save final values of segments + { + if (slt64(sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903) + sext_i32_i64(local_tid_6906), (int64_t) 256) && slt64(sext_i32_i64(local_tid_6906), squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903))) { + int64_t tmp_6931 = ((__local int64_t *) red_arr_i64_mem_6910)[(sext_i32_i64(local_tid_6906) + (int64_t) 1) * segment_sizze_nonzzero_6903 - (int64_t) 1]; + + ((__global int64_t *) mem_6758)[sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903) + sext_i32_i64(local_tid_6906)] = tmp_6931; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_3: + return; + #undef seghist_tblock_sizze_6305 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegmap_6405_dim1, 1, 1) +void chunked_entropyzisegmap_6405(__local uint64_t *shared_mem_aligned, __global int *global_failure, int failure_is_an_option, __global int64_t *global_failure_args, int64_t n_6046, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, __global unsigned char *xs_mem_6757, __global unsigned char *mem_6791) +{ + #define segmap_tblock_sizze_6401 (chunked_entropyzisegmap_6405zisegmap_tblock_sizze_6401) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + 
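// [annotation] one thread per chunk: build a private 256-bucket byte histogram for the chunk, reduce it to a Shannon entropy value, and store the result scaled into a single byte (0 = minimum entropy, 255 = maximum). +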
int32_t local_tid_6799; + int32_t tblock_sizze_6802; + int32_t wave_sizze_6801; + int32_t block_id_6800; + int32_t global_tid_6798; + int64_t phys_tid_6405; + int64_t global_tid_6803; + int64_t slice_6804; + int64_t gtid_6404; + int64_t remnant_6805; + + local_tid_6799 = get_local_id(0); + tblock_sizze_6802 = get_local_size(0); + wave_sizze_6801 = LOCKSTEP_WIDTH; + block_id_6800 = get_tblock_id(0); + global_tid_6798 = block_id_6800 * tblock_sizze_6802 + local_tid_6799; + phys_tid_6405 = sext_i32_i64(global_tid_6798); + global_tid_6803 = sext_i32_i64(block_id_6800) * segmap_tblock_sizze_6401 + sext_i32_i64(local_tid_6799); + slice_6804 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6404 = global_tid_6803; + remnant_6805 = global_tid_6803 - gtid_6404; + if (slt64(gtid_6404, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197)) { + int64_t entropy_arg0_6407; + int64_t zt_lhs_6408; + int64_t entropy_arg0_6409; + int64_t j_m_i_6410; + bool empty_slice_6411; + int64_t m_6412; + int64_t i_p_m_t_s_6413; + bool zzero_leq_i_p_m_t_s_6414; + bool i_p_m_t_s_leq_w_6415; + bool zzero_lte_i_6416; + bool i_lte_j_6417; + bool y_6418; + bool y_6419; + bool forwards_ok_6420; + bool ok_or_empty_6421; + bool index_certs_6422; + int64_t mem_6788[(int64_t) 256]; + float i64_res_6433; + float defunc_0_f_res_6434; + float redout_6742; + float zs_lhs_6445; + float log2_res_6446; + float lifted_lambda_res_6447; + float floor_arg0_6448; + float floor_res_6449; + int8_t unsign_arg0_6450; + + entropy_arg0_6407 = mul64(chunk_sizze_6047, gtid_6404); + zt_lhs_6408 = add64((int64_t) 1, gtid_6404); + entropy_arg0_6409 = mul64(chunk_sizze_6047, zt_lhs_6408); + j_m_i_6410 = sub64(entropy_arg0_6409, entropy_arg0_6407); + empty_slice_6411 = j_m_i_6410 == (int64_t) 0; + m_6412 = sub64(j_m_i_6410, (int64_t) 1); + i_p_m_t_s_6413 = add64(entropy_arg0_6407, m_6412); + zzero_leq_i_p_m_t_s_6414 = sle64((int64_t) 0, i_p_m_t_s_6413); + i_p_m_t_s_leq_w_6415 = slt64(i_p_m_t_s_6413, n_6046); + zzero_lte_i_6416 = sle64((int64_t) 0, entropy_arg0_6407); + i_lte_j_6417 = sle64(entropy_arg0_6407, entropy_arg0_6409); + y_6418 = i_p_m_t_s_leq_w_6415 && zzero_lte_i_6416; + y_6419 = zzero_leq_i_p_m_t_s_6414 && y_6418; + forwards_ok_6420 = i_lte_j_6417 && y_6419; + ok_or_empty_6421 = empty_slice_6411 || forwards_ok_6420; + if (!ok_or_empty_6421) { + { + if (atomic_cmpxchg_i32_global(global_failure, -1, 0) == -1) { + global_failure_args[0] = (int64_t) entropy_arg0_6407; + global_failure_args[1] = (int64_t) entropy_arg0_6409; + global_failure_args[2] = (int64_t) n_6046; + ; + } + return; + } + } + for (int64_t nest_i_6806 = 0; nest_i_6806 < (int64_t) 256; nest_i_6806++) { + mem_6788[nest_i_6806] = (int64_t) 0; + } + for (int64_t iter_6731 = 0; iter_6731 < j_m_i_6410; iter_6731++) { + int64_t slice_6755; + int8_t pixel_6733; + int64_t u8_res_6432; + bool less_than_zzero_6735; + bool greater_than_sizze_6736; + bool outside_bounds_dim_6737; + + slice_6755 = entropy_arg0_6407 + iter_6731; + pixel_6733 = ((__global int8_t *) xs_mem_6757)[slice_6755]; + u8_res_6432 = zext_i8_i64(pixel_6733); + less_than_zzero_6735 = slt64(u8_res_6432, (int64_t) 0); + greater_than_sizze_6736 = sle64((int64_t) 256, u8_res_6432); + outside_bounds_dim_6737 = less_than_zzero_6735 || greater_than_sizze_6736; + if (!outside_bounds_dim_6737) { + int64_t read_hist_6739; + int64_t defunc_0_op_res_6429; + + read_hist_6739 = mem_6788[u8_res_6432]; + defunc_0_op_res_6429 = add64((int64_t) 1, read_hist_6739); + mem_6788[u8_res_6432] = defunc_0_op_res_6429; + } + } + i64_res_6433 
= sitofp_i64_f32(j_m_i_6410); + redout_6742 = 0.0F; + for (int64_t i_6743 = 0; i_6743 < (int64_t) 256; i_6743++) { + int64_t eta_p_6438; + float i64_res_6439; + float lifted_lambda_res_6440; + bool cond_6441; + float lifted_lambda_res_6442; + float defunc_0_op_res_6437; + float redout_tmp_6808; + + eta_p_6438 = mem_6788[i_6743]; + i64_res_6439 = sitofp_i64_f32(eta_p_6438); + lifted_lambda_res_6440 = i64_res_6439 / i64_res_6433; + cond_6441 = lifted_lambda_res_6440 == 0.0F; + if (cond_6441) { + lifted_lambda_res_6442 = 0.0F; + } else { + float log2_res_6443; + float lifted_lambda_res_f_res_6444; + + log2_res_6443 = futrts_log2_32(lifted_lambda_res_6440); + lifted_lambda_res_f_res_6444 = lifted_lambda_res_6440 * log2_res_6443; + lifted_lambda_res_6442 = lifted_lambda_res_f_res_6444; + } + defunc_0_op_res_6437 = lifted_lambda_res_6442 + redout_6742; + redout_tmp_6808 = defunc_0_op_res_6437; + redout_6742 = redout_tmp_6808; + } + defunc_0_f_res_6434 = redout_6742; + zs_lhs_6445 = -1.0F * defunc_0_f_res_6434; + log2_res_6446 = futrts_log2_32(i64_res_6433); + lifted_lambda_res_6447 = zs_lhs_6445 / log2_res_6446; + floor_arg0_6448 = 255.0F * lifted_lambda_res_6447; + floor_res_6449 = futrts_floor32(floor_arg0_6448); + unsign_arg0_6450 = fptoui_f32_i8(floor_res_6449); + ((__global int8_t *) mem_6791)[gtid_6404] = unsign_arg0_6450; + } + + error_0: + return; + #undef segmap_tblock_sizze_6401 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegmap_6606_dim1, 1, 1) +void chunked_entropyzisegmap_6606(__local uint64_t *shared_mem_aligned, __global int *global_failure, int failure_is_an_option, __global int64_t *global_failure_args, int64_t n_6046, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, __global unsigned char *mem_6759) +{ + #define segmap_tblock_sizze_6600 (chunked_entropyzisegmap_6606zisegmap_tblock_sizze_6600) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6811; + int32_t tblock_sizze_6814; + int32_t wave_sizze_6813; + int32_t block_id_6812; + int32_t global_tid_6810; + int64_t phys_tid_6606; + int64_t global_tid_6815; + int64_t slice_6816; + int64_t gtid_6605; + int64_t remnant_6817; + + local_tid_6811 = get_local_id(0); + tblock_sizze_6814 = get_local_size(0); + wave_sizze_6813 = LOCKSTEP_WIDTH; + block_id_6812 = get_tblock_id(0); + global_tid_6810 = block_id_6812 * tblock_sizze_6814 + local_tid_6811; + phys_tid_6606 = sext_i32_i64(global_tid_6810); + global_tid_6815 = sext_i32_i64(block_id_6812) * segmap_tblock_sizze_6600 + sext_i32_i64(local_tid_6811); + slice_6816 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6605 = global_tid_6815; + remnant_6817 = global_tid_6815 - gtid_6605; + if (slt64(gtid_6605, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197)) { + int64_t entropy_arg0_6608; + int64_t zt_lhs_6609; + int64_t entropy_arg0_6610; + int64_t j_m_i_6611; + bool empty_slice_6612; + int64_t m_6613; + int64_t i_p_m_t_s_6614; + bool zzero_leq_i_p_m_t_s_6615; + bool i_p_m_t_s_leq_w_6616; + bool zzero_lte_i_6617; + bool i_lte_j_6618; + bool y_6619; + bool y_6620; + bool forwards_ok_6621; + bool ok_or_empty_6622; + bool index_certs_6623; + + entropy_arg0_6608 = mul64(chunk_sizze_6047, gtid_6605); + zt_lhs_6609 = add64((int64_t) 1, gtid_6605); + entropy_arg0_6610 = mul64(chunk_sizze_6047, zt_lhs_6609); + j_m_i_6611 = sub64(entropy_arg0_6610, entropy_arg0_6608); + empty_slice_6612 = j_m_i_6611 == (int64_t) 0; + m_6613 = sub64(j_m_i_6611, (int64_t) 1); + 
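// [annotation] i_p_m_t_s is start + (len - 1), the last index the slice would read; the slice passes iff it is empty or 0 <= start <= end and that last index lies in [0, n). +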
i_p_m_t_s_6614 = add64(entropy_arg0_6608, m_6613); + zzero_leq_i_p_m_t_s_6615 = sle64((int64_t) 0, i_p_m_t_s_6614); + i_p_m_t_s_leq_w_6616 = slt64(i_p_m_t_s_6614, n_6046); + zzero_lte_i_6617 = sle64((int64_t) 0, entropy_arg0_6608); + i_lte_j_6618 = sle64(entropy_arg0_6608, entropy_arg0_6610); + y_6619 = i_p_m_t_s_leq_w_6616 && zzero_lte_i_6617; + y_6620 = zzero_leq_i_p_m_t_s_6615 && y_6619; + forwards_ok_6621 = i_lte_j_6618 && y_6620; + ok_or_empty_6622 = empty_slice_6612 || forwards_ok_6621; + if (!ok_or_empty_6622) { + { + if (atomic_cmpxchg_i32_global(global_failure, -1, 1) == -1) { + global_failure_args[0] = (int64_t) entropy_arg0_6608; + global_failure_args[1] = (int64_t) entropy_arg0_6610; + global_failure_args[2] = (int64_t) n_6046; + ; + } + return; + } + } + } + + error_0: + return; + #undef segmap_tblock_sizze_6600 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegmap_6645_dim1, 1, 1) +void chunked_entropyzisegmap_6645(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, int64_t num_tblocks_6640, int32_t virt_num_tblocks_6818, __global unsigned char *xs_mem_6757, __global unsigned char *mem_6759, __global unsigned char *mem_6777) +{ + #define segmap_tblock_sizze_6639 (chunked_entropyzisegmap_6645zisegmap_tblock_sizze_6639) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6820; + int32_t tblock_sizze_6823; + int32_t wave_sizze_6822; + int32_t block_id_6821; + int32_t global_tid_6819; + int64_t phys_tid_6645; + int32_t phys_tblock_id_6824; + int32_t iterations_6825; + + local_tid_6820 = get_local_id(0); + tblock_sizze_6823 = get_local_size(0); + wave_sizze_6822 = LOCKSTEP_WIDTH; + block_id_6821 = get_tblock_id(0); + global_tid_6819 = block_id_6821 * tblock_sizze_6823 + local_tid_6820; + phys_tid_6645 = sext_i32_i64(global_tid_6819); + phys_tblock_id_6824 = get_tblock_id(0); + iterations_6825 = sdiv_up32(virt_num_tblocks_6818 - phys_tblock_id_6824, sext_i64_i32(num_tblocks_6640)); + for (int32_t i_6826 = 0; i_6826 < iterations_6825; i_6826++) { + int32_t virt_tblock_id_6827; + int64_t global_tid_6828; + int64_t slice_6829; + int64_t gtid_6644; + int64_t remnant_6830; + + virt_tblock_id_6827 = phys_tblock_id_6824 + i_6826 * sext_i64_i32(num_tblocks_6640); + global_tid_6828 = sext_i32_i64(virt_tblock_id_6827) * segmap_tblock_sizze_6639 + sext_i32_i64(local_tid_6820); + slice_6829 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6644 = global_tid_6828; + remnant_6830 = global_tid_6828 - gtid_6644; + if (slt64(gtid_6644, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197)) { + int64_t index_primexp_6712; + int64_t binop_y_6704; + int64_t binop_x_6705; + int64_t index_primexp_6709; + bool index_certs_6648; + int64_t mem_6765[(int64_t) 256]; + + index_primexp_6712 = mul64(chunk_sizze_6047, gtid_6644); + binop_y_6704 = add64((int64_t) 1, gtid_6644); + binop_x_6705 = mul64(chunk_sizze_6047, binop_y_6704); + index_primexp_6709 = sub64(binop_x_6705, index_primexp_6712); + index_certs_6648 = 0; + for (int64_t nest_i_6831 = 0; nest_i_6831 < (int64_t) 256; nest_i_6831++) { + mem_6765[nest_i_6831] = (int64_t) 0; + } + for (int64_t iter_6744 = 0; iter_6744 < index_primexp_6709; iter_6744++) { + int64_t slice_6756; + int8_t pixel_6746; + int64_t u8_res_6659; + bool less_than_zzero_6748; + bool greater_than_sizze_6749; + bool outside_bounds_dim_6750; + + slice_6756 = index_primexp_6712 + iter_6744; + 
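// [annotation] bump the bucket for each byte of this thread's chunk; the zero-extended u8 value always lies in [0, 256), so the bounds test below cannot fail. +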
pixel_6746 = ((__global int8_t *) xs_mem_6757)[slice_6756]; + u8_res_6659 = zext_i8_i64(pixel_6746); + less_than_zzero_6748 = slt64(u8_res_6659, (int64_t) 0); + greater_than_sizze_6749 = sle64((int64_t) 256, u8_res_6659); + outside_bounds_dim_6750 = less_than_zzero_6748 || greater_than_sizze_6749; + if (!outside_bounds_dim_6750) { + int64_t read_hist_6752; + int64_t defunc_0_op_res_6656; + + read_hist_6752 = mem_6765[u8_res_6659]; + defunc_0_op_res_6656 = add64((int64_t) 1, read_hist_6752); + mem_6765[u8_res_6659] = defunc_0_op_res_6656; + } + } + for (int64_t i_0 = 0; i_0 < (int64_t) 256; i_0++) { + ((__global int64_t *) mem_6777)[gtid_6644 + i_0 * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197] = mem_6765[i_0]; + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_1: + return; + #undef segmap_tblock_sizze_6639 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegmap_6687_dim1, 1, 1) +void chunked_entropyzisegmap_6687(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, __global unsigned char *mem_6780, __global unsigned char *mem_6782) +{ + #define segmap_tblock_sizze_6683 (chunked_entropyzisegmap_6687zisegmap_tblock_sizze_6683) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6927; + int32_t tblock_sizze_6930; + int32_t wave_sizze_6929; + int32_t block_id_6928; + int32_t global_tid_6926; + int64_t phys_tid_6687; + int64_t global_tid_6931; + int64_t slice_6932; + int64_t gtid_6686; + int64_t remnant_6933; + + local_tid_6927 = get_local_id(0); + tblock_sizze_6930 = get_local_size(0); + wave_sizze_6929 = LOCKSTEP_WIDTH; + block_id_6928 = get_tblock_id(0); + global_tid_6926 = block_id_6928 * tblock_sizze_6930 + local_tid_6927; + phys_tid_6687 = sext_i32_i64(global_tid_6926); + global_tid_6931 = sext_i32_i64(block_id_6928) * segmap_tblock_sizze_6683 + sext_i32_i64(local_tid_6927); + slice_6932 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6686 = global_tid_6931; + remnant_6933 = global_tid_6931 - gtid_6686; + if (slt64(gtid_6686, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197)) { + int64_t binop_y_6715; + int64_t binop_x_6716; + int64_t binop_y_6719; + int64_t convop_x_6720; + float index_primexp_6721; + float defunc_0_f_res_6689; + float zs_lhs_6690; + float log2_res_6691; + float lifted_lambda_res_6692; + float floor_arg0_6693; + float floor_res_6694; + int8_t unsign_arg0_6695; + + binop_y_6715 = add64((int64_t) 1, gtid_6686); + binop_x_6716 = mul64(chunk_sizze_6047, binop_y_6715); + binop_y_6719 = mul64(chunk_sizze_6047, gtid_6686); + convop_x_6720 = sub64(binop_x_6716, binop_y_6719); + index_primexp_6721 = sitofp_i64_f32(convop_x_6720); + defunc_0_f_res_6689 = ((__global float *) mem_6780)[gtid_6686]; + zs_lhs_6690 = -1.0F * defunc_0_f_res_6689; + log2_res_6691 = futrts_log2_32(index_primexp_6721); + lifted_lambda_res_6692 = zs_lhs_6690 / log2_res_6691; + floor_arg0_6693 = 255.0F * lifted_lambda_res_6692; + floor_res_6694 = futrts_floor32(floor_arg0_6693); + unsign_arg0_6695 = fptoui_f32_i8(floor_res_6694); + ((__global int8_t *) mem_6782)[gtid_6686] = unsign_arg0_6695; + } + + error_0: + return; + #undef segmap_tblock_sizze_6683 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegred_large_6669_dim1, 1, 1) +void chunked_entropyzisegred_large_6669(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t chunk_sizze_6047, int64_t 
dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, int64_t num_tblocks_6664, int64_t blocks_per_segment_6862, int64_t q_6863, int64_t num_virtblocks_6864, int64_t threads_per_segment_6865, __global unsigned char *mem_6777, __global unsigned char *mem_6780, __global unsigned char *segred_tmp_mem_6866, __global unsigned char *counters_mem_6868) +{ + #define segred_tblock_sizze_6663 (chunked_entropyzisegred_large_6669zisegred_tblock_sizze_6663) + #define chunk_sizze_6833 (chunked_entropyzisegred_large_6669zichunk_sizze_6833) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *sync_arr_mem_6897_backing_1 = &shared_mem[0]; + const int64_t sync_arr_mem_6897_backing_1_offset = 0 + 8; + volatile __local unsigned char *red_arr_f32_mem_6895_backing_0 = &shared_mem[sync_arr_mem_6897_backing_1_offset]; + const int64_t red_arr_f32_mem_6895_backing_0_offset = sync_arr_mem_6897_backing_1_offset + ((int64_t) 4 * segred_tblock_sizze_6663 + srem64((int64_t) 8 - srem64((int64_t) 4 * segred_tblock_sizze_6663, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6891; + int32_t tblock_sizze_6894; + int32_t wave_sizze_6893; + int32_t block_id_6892; + int32_t global_tid_6890; + int64_t phys_tid_6669; + __local unsigned char *red_arr_f32_mem_6895; + __local unsigned char *sync_arr_mem_6897; + int32_t phys_tblock_id_6899; + int32_t iterations_6900; + + local_tid_6891 = get_local_id(0); + tblock_sizze_6894 = get_local_size(0); + wave_sizze_6893 = LOCKSTEP_WIDTH; + block_id_6892 = get_tblock_id(0); + global_tid_6890 = block_id_6892 * tblock_sizze_6894 + local_tid_6891; + phys_tid_6669 = sext_i32_i64(global_tid_6890); + red_arr_f32_mem_6895 = (__local unsigned char *) red_arr_f32_mem_6895_backing_0; + sync_arr_mem_6897 = (__local unsigned char *) sync_arr_mem_6897_backing_1; + phys_tblock_id_6899 = get_tblock_id(0); + iterations_6900 = sdiv_up32(sext_i64_i32(num_virtblocks_6864) - phys_tblock_id_6899, sext_i64_i32(num_tblocks_6664)); + for (int32_t i_6901 = 0; i_6901 < iterations_6900; i_6901++) { + int32_t virt_tblock_id_6902; + int64_t flat_segment_id_6903; + int64_t global_tid_6904; + int64_t slice_6905; + int64_t gtid_6667; + int64_t remnant_6906; + int64_t gtid_6668; + float eta_p_block_res_acc_6907; + float eta_p_6670; + float eta_p_6671; + int64_t tblock_id_in_segment_6911; + int64_t block_base_offset_6912; + int32_t offset_6915; + int32_t skip_waves_6916; + float eta_p_6908; + float eta_p_6909; + + virt_tblock_id_6902 = phys_tblock_id_6899 + i_6901 * sext_i64_i32(num_tblocks_6664); + flat_segment_id_6903 = squot64(sext_i32_i64(virt_tblock_id_6902), blocks_per_segment_6862); + global_tid_6904 = srem64(sext_i32_i64(virt_tblock_id_6902) * segred_tblock_sizze_6663 + sext_i32_i64(local_tid_6891), threads_per_segment_6865); + slice_6905 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6667 = flat_segment_id_6903; + remnant_6906 = flat_segment_id_6903 - gtid_6667; + // ne-initialise the outer (per-block) accumulator(s) + { + eta_p_block_res_acc_6907 = 0.0F; + } + tblock_id_in_segment_6911 = squot64(global_tid_6904, segred_tblock_sizze_6663); + block_base_offset_6912 = tblock_id_in_segment_6911 * q_6863 * segred_tblock_sizze_6663; + for (int64_t i_6913 = 0; i_6913 < q_6863; i_6913++) { + int64_t block_offset_6914 = block_base_offset_6912 + i_6913 * segred_tblock_sizze_6663; + + gtid_6668 = global_tid_6904 + threads_per_segment_6865 * i_6913; + if (slt64(gtid_6668, (int64_t) 256)) { + // apply map 
function(s) + { + // apply map function + { + int64_t binop_y_6724 = add64((int64_t) 1, gtid_6667); + int64_t binop_x_6725 = mul64(chunk_sizze_6047, binop_y_6724); + int64_t binop_y_6728 = mul64(chunk_sizze_6047, gtid_6667); + int64_t convop_x_6729 = sub64(binop_x_6725, binop_y_6728); + float index_primexp_6730 = sitofp_i64_f32(convop_x_6729); + int64_t eta_p_6675 = ((__global int64_t *) mem_6777)[gtid_6667 + gtid_6668 * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197]; + float i64_res_6676 = sitofp_i64_f32(eta_p_6675); + float lifted_lambda_res_6677 = i64_res_6676 / index_primexp_6730; + bool cond_6678 = lifted_lambda_res_6677 == 0.0F; + float lifted_lambda_res_6679; + + if (cond_6678) { + lifted_lambda_res_6679 = 0.0F; + } else { + float log2_res_6680 = futrts_log2_32(lifted_lambda_res_6677); + float lifted_lambda_res_f_res_6681 = lifted_lambda_res_6677 * log2_res_6680; + + lifted_lambda_res_6679 = lifted_lambda_res_f_res_6681; + } + // load accumulator(s) + { + eta_p_6670 = eta_p_block_res_acc_6907; + } + // load next value(s) + { + eta_p_6671 = lifted_lambda_res_6679; + } + // apply reduction operator(s) + { + float defunc_0_op_res_6672 = eta_p_6670 + eta_p_6671; + + // store in accumulator(s) + { + eta_p_block_res_acc_6907 = defunc_0_op_res_6672; + } + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // store accs. prims go in lmem; non-prims in params (in global mem) + { + ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_block_res_acc_6907; + } + barrier(CLK_LOCAL_MEM_FENCE); + skip_waves_6916 = 1; + offset_6915 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6891, sext_i64_i32(segred_tblock_sizze_6663))) { + eta_p_6908 = ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6915)]; + } + } + offset_6915 = 1; + while (slt32(offset_6915, wave_sizze_6893)) { + if (slt32(local_tid_6891 + offset_6915, sext_i64_i32(segred_tblock_sizze_6663)) && ((local_tid_6891 - squot32(local_tid_6891, wave_sizze_6893) * wave_sizze_6893) & (2 * offset_6915 - 1)) == 0) { + // read array element + { + eta_p_6909 = ((volatile __local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6915)]; + } + // apply reduction operation + { + float defunc_0_op_res_6910 = eta_p_6908 + eta_p_6909; + + eta_p_6908 = defunc_0_op_res_6910; + } + // write result of operation + { + ((volatile __local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6908; + } + } + offset_6915 *= 2; + } + while (slt32(skip_waves_6916, squot32(sext_i64_i32(segred_tblock_sizze_6663) + wave_sizze_6893 - 1, wave_sizze_6893))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6915 = skip_waves_6916 * wave_sizze_6893; + if (slt32(local_tid_6891 + offset_6915, sext_i64_i32(segred_tblock_sizze_6663)) && ((local_tid_6891 - squot32(local_tid_6891, wave_sizze_6893) * wave_sizze_6893) == 0 && (squot32(local_tid_6891, wave_sizze_6893) & (2 * skip_waves_6916 - 1)) == 0)) { + // read array element + { + eta_p_6909 = ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6915)]; + } + // apply reduction operation + { + float defunc_0_op_res_6910 = eta_p_6908 + eta_p_6909; + + eta_p_6908 = defunc_0_op_res_6910; + } + // write result of operation + { + ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6908; + } + } + skip_waves_6916 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + // thread 0 updates per-block acc(s); rest reset to ne + { + if 
(sext_i32_i64(local_tid_6891) == (int64_t) 0) { + eta_p_block_res_acc_6907 = eta_p_6908; + } else { + eta_p_block_res_acc_6907 = 0.0F; + } + } + if (blocks_per_segment_6862 == (int64_t) 1) { + // first thread in block saves final result to memory + { + if (local_tid_6891 == 0) { + ((__global float *) mem_6780)[gtid_6667] = eta_p_block_res_acc_6907; + } + } + } else { + int32_t old_counter_6917; + bool is_last_block_6918; + + // first thread in block saves block result to global memory + { + if (local_tid_6891 == 0) { + ((__global float *) segred_tmp_mem_6866)[sext_i32_i64(virt_tblock_id_6902)] = eta_p_block_res_acc_6907; + mem_fence_global(); + old_counter_6917 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6868)[srem64(flat_segment_id_6903, (int64_t) 20480)], (int) 1); + ((__local bool *) sync_arr_mem_6897)[(int64_t) 0] = old_counter_6917 == sext_i64_i32(blocks_per_segment_6862 - (int64_t) 1); + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + is_last_block_6918 = ((__local bool *) sync_arr_mem_6897)[(int64_t) 0]; + if (is_last_block_6918) { + if (local_tid_6891 == 0) { + old_counter_6917 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6868)[srem64(flat_segment_id_6903, (int64_t) 20480)], (int) sext_i64_i32((int64_t) 0 - blocks_per_segment_6862)); + } + // read in the per-block-results + { + int64_t read_per_thread_6919 = sdiv_up64(blocks_per_segment_6862, segred_tblock_sizze_6663); + + eta_p_6670 = 0.0F; + for (int64_t i_6920 = 0; i_6920 < read_per_thread_6919; i_6920++) { + int64_t block_res_id_6921 = sext_i32_i64(local_tid_6891) * read_per_thread_6919 + i_6920; + int64_t index_of_block_res_6922 = flat_segment_id_6903 * blocks_per_segment_6862 + block_res_id_6921; + + if (slt64(block_res_id_6921, blocks_per_segment_6862)) { + eta_p_6671 = ((__global float *) segred_tmp_mem_6866)[index_of_block_res_6922]; + + float defunc_0_op_res_6672 = eta_p_6670 + eta_p_6671; + + eta_p_6670 = defunc_0_op_res_6672; + } + } + } + ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6670; + barrier(CLK_LOCAL_MEM_FENCE); + // reduce the per-block results + { + int32_t offset_6923; + int32_t skip_waves_6924 = 1; + float eta_p_6908; + float eta_p_6909; + + offset_6923 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6891, sext_i64_i32(segred_tblock_sizze_6663))) { + eta_p_6908 = ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6923)]; + } + } + offset_6923 = 1; + while (slt32(offset_6923, wave_sizze_6893)) { + if (slt32(local_tid_6891 + offset_6923, sext_i64_i32(segred_tblock_sizze_6663)) && ((local_tid_6891 - squot32(local_tid_6891, wave_sizze_6893) * wave_sizze_6893) & (2 * offset_6923 - 1)) == 0) { + // read array element + { + eta_p_6909 = ((volatile __local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6923)]; + } + // apply reduction operation + { + float defunc_0_op_res_6910 = eta_p_6908 + eta_p_6909; + + eta_p_6908 = defunc_0_op_res_6910; + } + // write result of operation + { + ((volatile __local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6908; + } + } + offset_6923 *= 2; + } + while (slt32(skip_waves_6924, squot32(sext_i64_i32(segred_tblock_sizze_6663) + wave_sizze_6893 - 1, wave_sizze_6893))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6923 = skip_waves_6924 * wave_sizze_6893; + if (slt32(local_tid_6891 + offset_6923, sext_i64_i32(segred_tblock_sizze_6663)) && ((local_tid_6891 - squot32(local_tid_6891, 
wave_sizze_6893) * wave_sizze_6893) == 0 && (squot32(local_tid_6891, wave_sizze_6893) & (2 * skip_waves_6924 - 1)) == 0)) { + // read array element + { + eta_p_6909 = ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6923)]; + } + // apply reduction operation + { + float defunc_0_op_res_6910 = eta_p_6908 + eta_p_6909; + + eta_p_6908 = defunc_0_op_res_6910; + } + // write result of operation + { + ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6908; + } + } + skip_waves_6924 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + // and back to memory with the final result + { + if (local_tid_6891 == 0) { + ((__global float *) mem_6780)[gtid_6667] = eta_p_6908; + } + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_6: + return; + #undef segred_tblock_sizze_6663 + #undef chunk_sizze_6833 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegred_small_6669_dim1, 1, 1) +void chunked_entropyzisegred_small_6669(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, int64_t num_tblocks_6664, int64_t segment_sizze_nonzzero_6834, __global unsigned char *mem_6777, __global unsigned char *mem_6780) +{ + #define segred_tblock_sizze_6663 (chunked_entropyzisegred_small_6669zisegred_tblock_sizze_6663) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *red_arr_f32_mem_6841_backing_0 = &shared_mem[0]; + const int64_t red_arr_f32_mem_6841_backing_0_offset = 0 + ((int64_t) 4 * segred_tblock_sizze_6663 + srem64((int64_t) 8 - srem64((int64_t) 4 * segred_tblock_sizze_6663, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6837; + int32_t tblock_sizze_6840; + int32_t wave_sizze_6839; + int32_t block_id_6838; + int32_t global_tid_6836; + int64_t phys_tid_6669; + __local unsigned char *red_arr_f32_mem_6841; + int32_t phys_tblock_id_6843; + int32_t iterations_6844; + + local_tid_6837 = get_local_id(0); + tblock_sizze_6840 = get_local_size(0); + wave_sizze_6839 = LOCKSTEP_WIDTH; + block_id_6838 = get_tblock_id(0); + global_tid_6836 = block_id_6838 * tblock_sizze_6840 + local_tid_6837; + phys_tid_6669 = sext_i32_i64(global_tid_6836); + red_arr_f32_mem_6841 = (__local unsigned char *) red_arr_f32_mem_6841_backing_0; + phys_tblock_id_6843 = get_tblock_id(0); + iterations_6844 = sdiv_up32(sext_i64_i32(sdiv_up64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834))) - phys_tblock_id_6843, sext_i64_i32(num_tblocks_6664)); + for (int32_t i_6845 = 0; i_6845 < iterations_6844; i_6845++) { + int32_t virt_tblock_id_6846; + int64_t slice_6847; + int64_t gtid_6667; + int64_t remnant_6848; + int64_t gtid_6668; + + virt_tblock_id_6846 = phys_tblock_id_6843 + i_6845 * sext_i64_i32(num_tblocks_6664); + slice_6847 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6667 = squot64(sext_i32_i64(local_tid_6837), segment_sizze_nonzzero_6834) + sext_i32_i64(virt_tblock_id_6846) * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834); + remnant_6848 = squot64(sext_i32_i64(local_tid_6837), segment_sizze_nonzzero_6834) + sext_i32_i64(virt_tblock_id_6846) * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834) - gtid_6667; + gtid_6668 = srem64(sext_i32_i64(local_tid_6837), (int64_t) 256); + // apply map function if in bounds + { + if (slt64((int64_t) 0, (int64_t) 256) && 
(slt64(gtid_6667, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197) && slt64(sext_i32_i64(local_tid_6837), (int64_t) 256 * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834)))) { + // apply map function + { + int64_t binop_y_6724 = add64((int64_t) 1, gtid_6667); + int64_t binop_x_6725 = mul64(chunk_sizze_6047, binop_y_6724); + int64_t binop_y_6728 = mul64(chunk_sizze_6047, gtid_6667); + int64_t convop_x_6729 = sub64(binop_x_6725, binop_y_6728); + float index_primexp_6730 = sitofp_i64_f32(convop_x_6729); + int64_t eta_p_6675 = ((__global int64_t *) mem_6777)[gtid_6667 + gtid_6668 * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197]; + float i64_res_6676 = sitofp_i64_f32(eta_p_6675); + float lifted_lambda_res_6677 = i64_res_6676 / index_primexp_6730; + bool cond_6678 = lifted_lambda_res_6677 == 0.0F; + float lifted_lambda_res_6679; + + if (cond_6678) { + lifted_lambda_res_6679 = 0.0F; + } else { + float log2_res_6680 = futrts_log2_32(lifted_lambda_res_6677); + float lifted_lambda_res_f_res_6681 = lifted_lambda_res_6677 * log2_res_6680; + + lifted_lambda_res_6679 = lifted_lambda_res_f_res_6681; + } + // save results to be reduced + { + ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = lifted_lambda_res_6679; + } + } + } else { + ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = 0.0F; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (slt64((int64_t) 0, (int64_t) 256)) { + // perform segmented scan to imitate reduction + { + float eta_p_6670; + float eta_p_6671; + float eta_p_6849; + float eta_p_6850; + bool ltid_in_bounds_6852 = slt64(sext_i32_i64(local_tid_6837), (int64_t) 256 * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834)); + int32_t skip_threads_6853; + + // read input for in-block scan + { + if (ltid_in_bounds_6852) { + eta_p_6671 = ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)]; + if ((local_tid_6837 - squot32(local_tid_6837, 32) * 32) == 0) { + eta_p_6670 = eta_p_6671; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6853 = 1; + while (slt32(skip_threads_6853, 32)) { + bool thread_active_6854 = sle32(skip_threads_6853, local_tid_6837 - squot32(local_tid_6837, 32) * 32) && ltid_in_bounds_6852; + + if (thread_active_6854) { + // read operands + { + eta_p_6670 = ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837) - sext_i32_i64(skip_threads_6853)]; + } + } + // perform operation + { + bool inactive_6855 = slt64(srem64(sext_i32_i64(local_tid_6837), (int64_t) 256), sext_i32_i64(local_tid_6837) - sext_i32_i64(local_tid_6837 - skip_threads_6853)); + + if (thread_active_6854 && inactive_6855) { + eta_p_6670 = eta_p_6671; + } + if (thread_active_6854) { + if (!inactive_6855) { + float defunc_0_op_res_6672 = eta_p_6670 + eta_p_6671; + + eta_p_6670 = defunc_0_op_res_6672; + } + } + } + if (sle32(wave_sizze_6839, skip_threads_6853)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6854) { + // write result + { + ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = eta_p_6670; + eta_p_6671 = eta_p_6670; + } + } + if (sle32(wave_sizze_6839, skip_threads_6853)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6853 *= 2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // last thread of block 'i' writes its result to offset 'i' + { + if ((local_tid_6837 - squot32(local_tid_6837, 32) * 32) == 31 && ltid_in_bounds_6852) { + ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(squot32(local_tid_6837, 
32))] = eta_p_6670; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // scan the first block, after which offset 'i' contains carry-in for block 'i+1' + { + int32_t skip_threads_6856; + + // read input for in-block scan + { + if (squot32(local_tid_6837, 32) == 0 && ltid_in_bounds_6852) { + eta_p_6850 = ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)]; + if ((local_tid_6837 - squot32(local_tid_6837, 32) * 32) == 0) { + eta_p_6849 = eta_p_6850; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6856 = 1; + while (slt32(skip_threads_6856, 32)) { + bool thread_active_6857 = sle32(skip_threads_6856, local_tid_6837 - squot32(local_tid_6837, 32) * 32) && (squot32(local_tid_6837, 32) == 0 && ltid_in_bounds_6852); + + if (thread_active_6857) { + // read operands + { + eta_p_6849 = ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837) - sext_i32_i64(skip_threads_6856)]; + } + } + // perform operation + { + bool inactive_6858 = slt64(srem64(sext_i32_i64(local_tid_6837 * 32 + 32 - 1), (int64_t) 256), sext_i32_i64(local_tid_6837 * 32 + 32 - 1) - sext_i32_i64((local_tid_6837 - skip_threads_6856) * 32 + 32 - 1)); + + if (thread_active_6857 && inactive_6858) { + eta_p_6849 = eta_p_6850; + } + if (thread_active_6857) { + if (!inactive_6858) { + float defunc_0_op_res_6851 = eta_p_6849 + eta_p_6850; + + eta_p_6849 = defunc_0_op_res_6851; + } + } + } + if (sle32(wave_sizze_6839, skip_threads_6856)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6857) { + // write result + { + ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = eta_p_6849; + eta_p_6850 = eta_p_6849; + } + } + if (sle32(wave_sizze_6839, skip_threads_6856)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6856 *= 2; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + bool no_carry_in_6859 = squot32(local_tid_6837, 32) == 0 || !ltid_in_bounds_6852; + + // carry-in for every block except the first + { + // read operands + { + if (!no_carry_in_6859) { + eta_p_6671 = eta_p_6670; + eta_p_6670 = ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(squot32(local_tid_6837, 32)) - (int64_t) 1]; + } + } + // perform operation + { + bool inactive_6860 = slt64(srem64(sext_i32_i64(local_tid_6837), (int64_t) 256), sext_i32_i64(local_tid_6837) - sext_i32_i64(squot32(local_tid_6837, 32) * 32 - 1)); + + if (!no_carry_in_6859) { + if (inactive_6860) { + eta_p_6670 = eta_p_6671; + } + } + if (!no_carry_in_6859) { + if (!inactive_6860) { + float defunc_0_op_res_6672 = eta_p_6670 + eta_p_6671; + + eta_p_6670 = defunc_0_op_res_6672; + } + } + } + // write final result + { + if (!no_carry_in_6859) { + ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = eta_p_6670; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // restore correct values for first block + { + if (squot32(local_tid_6837, 32) == 0 && ltid_in_bounds_6852) { + ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = eta_p_6671; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // save final values of segments + { + if (slt64(sext_i32_i64(virt_tblock_id_6846) * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834) + sext_i32_i64(local_tid_6837), dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197) && slt64(sext_i32_i64(local_tid_6837), squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834))) { + float tmp_6861 = ((__local float *) red_arr_f32_mem_6841)[(sext_i32_i64(local_tid_6837) + (int64_t) 1) * 
segment_sizze_nonzzero_6834 - (int64_t) 1]; + + ((__global float *) mem_6780)[sext_i32_i64(virt_tblock_id_6846) * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834) + sext_i32_i64(local_tid_6837)] = tmp_6861; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_3: + return; + #undef segred_tblock_sizze_6663 +} +FUTHARK_KERNEL_SIZED(entropyziseghist_global_6328_dim1, 1, 1) +void entropyziseghist_global_6328(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t n_5907, int64_t num_tblocks_6323, int64_t num_subhistos_6815, int32_t chk_i_6885, int64_t hist_H_chk_6886, __global unsigned char *xs_mem_6757, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define seghist_tblock_sizze_6321 (entropyziseghist_global_6328ziseghist_tblock_sizze_6321) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6888; + int32_t tblock_sizze_6891; + int32_t wave_sizze_6890; + int32_t block_id_6889; + int32_t global_tid_6887; + int64_t phys_tid_6328; + int32_t subhisto_ind_6892; + int64_t num_chunks_6893; + + local_tid_6888 = get_local_id(0); + tblock_sizze_6891 = get_local_size(0); + wave_sizze_6890 = LOCKSTEP_WIDTH; + block_id_6889 = get_tblock_id(0); + global_tid_6887 = block_id_6889 * tblock_sizze_6891 + local_tid_6888; + phys_tid_6328 = sext_i32_i64(global_tid_6887); + subhisto_ind_6892 = squot32(global_tid_6887, sdiv_up32(sext_i64_i32(seghist_tblock_sizze_6321 * num_tblocks_6323), sext_i64_i32(num_subhistos_6815))); + num_chunks_6893 = sdiv_up64(n_5907, sext_i32_i64(sext_i64_i32(seghist_tblock_sizze_6321 * num_tblocks_6323))); + for (int64_t chunk_i_6894 = 0; chunk_i_6894 < num_chunks_6893; chunk_i_6894++) { + int64_t i_6895 = chunk_i_6894 * sext_i32_i64(sext_i64_i32(seghist_tblock_sizze_6321 * num_tblocks_6323)) + sext_i32_i64(global_tid_6887); + + if (slt64(i_6895, n_5907)) { + int64_t slice_6896; + int64_t gtid_6327; + int64_t remnant_6897; + + slice_6896 = n_5907; + gtid_6327 = i_6895; + remnant_6897 = i_6895 - gtid_6327; + if (slt64(i_6895, n_5907)) { + int8_t eta_p_6332; + int64_t u8_res_6334; + + eta_p_6332 = ((__global int8_t *) xs_mem_6757)[gtid_6327]; + u8_res_6334 = zext_i8_i64(eta_p_6332); + // save map-out results + { } + // perform atomic updates + { + if (sle64(sext_i32_i64(chk_i_6885) * hist_H_chk_6886, u8_res_6334) && (slt64(u8_res_6334, sext_i32_i64(chk_i_6885) * hist_H_chk_6886 + hist_H_chk_6886) && (sle64((int64_t) 0, u8_res_6334) && slt64(u8_res_6334, (int64_t) 256)))) { + int64_t eta_p_6329; + int64_t eta_p_6330 = (int64_t) 1; + int64_t old_6898; + + old_6898 = atomic_add_i64_global(&((volatile __global int64_t *) defunc_0_map_res_subhistos_mem_6816)[sext_i32_i64(subhisto_ind_6892) * (int64_t) 256 + u8_res_6334], (int64_t) eta_p_6330); + } + } + } + } + } + + error_0: + return; + #undef seghist_tblock_sizze_6321 +} +FUTHARK_KERNEL_SIZED(entropyziseghist_local_6328_dim1, 1, 1) +void entropyziseghist_local_6328(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t n_5907, int64_t num_subhistos_6815, int64_t num_tblocks_6826, int32_t hist_M_6832, int32_t chk_i_6836, int64_t num_segments_6837, int64_t hist_H_chk_6838, int64_t histo_sizze_6839, int32_t init_per_thread_6840, __global unsigned char *xs_mem_6757, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define max_tblock_sizze_6825 (entropyziseghist_local_6328zimax_tblock_sizze_6825) + + __local 
unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *subhistogram_local_mem_6854_backing_0 = &shared_mem[0]; + const int64_t subhistogram_local_mem_6854_backing_0_offset = 0 + ((int64_t) 8 * (hist_M_6832 * hist_H_chk_6838) + srem64((int64_t) 8 - srem64((int64_t) 8 * (hist_M_6832 * hist_H_chk_6838), (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6842; + int32_t tblock_sizze_6845; + int32_t wave_sizze_6844; + int32_t block_id_6843; + int32_t global_tid_6841; + int64_t phys_tid_6328; + int32_t phys_tblock_id_6846; + int32_t iterations_6847; + + local_tid_6842 = get_local_id(0); + tblock_sizze_6845 = get_local_size(0); + wave_sizze_6844 = LOCKSTEP_WIDTH; + block_id_6843 = get_tblock_id(0); + global_tid_6841 = block_id_6843 * tblock_sizze_6845 + local_tid_6842; + phys_tid_6328 = sext_i32_i64(global_tid_6841); + phys_tblock_id_6846 = get_tblock_id(0); + iterations_6847 = sdiv_up32(sext_i64_i32(num_tblocks_6826 * num_segments_6837) - phys_tblock_id_6846, sext_i64_i32(num_tblocks_6826)); + for (int32_t i_6848 = 0; i_6848 < iterations_6847; i_6848++) { + int32_t virt_tblock_id_6849; + int32_t flat_segment_id_6850; + int32_t gid_in_segment_6851; + int32_t pgtid_in_segment_6852; + int32_t threads_per_segment_6853; + __local unsigned char *subhistogram_local_mem_6854; + int32_t thread_local_subhisto_i_6856; + int64_t num_chunks_6863; + + virt_tblock_id_6849 = phys_tblock_id_6846 + i_6848 * sext_i64_i32(num_tblocks_6826); + flat_segment_id_6850 = squot32(virt_tblock_id_6849, sext_i64_i32(num_tblocks_6826)); + gid_in_segment_6851 = srem32(virt_tblock_id_6849, sext_i64_i32(num_tblocks_6826)); + pgtid_in_segment_6852 = gid_in_segment_6851 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + threads_per_segment_6853 = sext_i64_i32(num_tblocks_6826 * max_tblock_sizze_6825); + subhistogram_local_mem_6854 = (__local unsigned char *) subhistogram_local_mem_6854_backing_0; + thread_local_subhisto_i_6856 = srem32(local_tid_6842, hist_M_6832); + // initialize histograms in shared memory + { + for (int32_t local_i_6857 = 0; local_i_6857 < init_per_thread_6840; local_i_6857++) { + int32_t j_6858 = local_i_6857 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + int32_t j_offset_6859 = hist_M_6832 * sext_i64_i32(histo_sizze_6839) * gid_in_segment_6851 + j_6858; + int32_t local_subhisto_i_6860 = squot32(j_6858, sext_i64_i32(histo_sizze_6839)); + int32_t global_subhisto_i_6861 = squot32(j_offset_6859, sext_i64_i32(histo_sizze_6839)); + + if (slt32(j_6858, hist_M_6832 * sext_i64_i32(histo_sizze_6839))) { + // First subhistogram is initialised from global memory; others with neutral element. 
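+ // (Sketch, using this kernel's own names: shared memory holds
+ // hist_M_6832 subhistograms of hist_H_chk_6838 buckets each, and the
+ // histogram is processed in chunks indexed by chk_i_6836. Subhistogram 0
+ // is seeded from the totals already accumulated in global memory, while
+ // subhistograms 1..hist_M_6832-1 start at 0, the neutral element of
+ // add64, so the compaction step below can sum all copies of a bucket
+ // without double-counting the carried-over counts.)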
+ { + if (global_subhisto_i_6861 == 0 && ((sle64((int64_t) 0, (int64_t) 0) && slt64((int64_t) 0, num_subhistos_6815)) && (sle64((int64_t) 0, sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838) && slt64(sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838, (int64_t) 256)))) { + int64_t tmp_6862 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838]; + + ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(local_subhisto_i_6860) * hist_H_chk_6838 + sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839)))] = tmp_6862; + } else { + ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(local_subhisto_i_6860) * hist_H_chk_6838 + sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839)))] = (int64_t) 0; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + num_chunks_6863 = sdiv_up64(n_5907, sext_i32_i64(threads_per_segment_6853)); + for (int64_t chunk_i_6864 = 0; chunk_i_6864 < num_chunks_6863; chunk_i_6864++) { + int64_t i_6865 = chunk_i_6864 * sext_i32_i64(threads_per_segment_6853) + sext_i32_i64(pgtid_in_segment_6852); + + if (slt64(i_6865, n_5907)) { + int64_t gtid_6327; + int8_t eta_p_6332; + int64_t u8_res_6334; + + gtid_6327 = i_6865; + eta_p_6332 = ((__global int8_t *) xs_mem_6757)[gtid_6327]; + u8_res_6334 = zext_i8_i64(eta_p_6332); + if (chk_i_6836 == 0) { + // save map-out results + { } + } + // perform atomic updates + { + if ((sle64((int64_t) 0, u8_res_6334) && slt64(u8_res_6334, (int64_t) 256)) && (sle64(sext_i32_i64(chk_i_6836) * hist_H_chk_6838, u8_res_6334) && slt64(u8_res_6334, sext_i32_i64(chk_i_6836) * hist_H_chk_6838 + hist_H_chk_6838))) { + int64_t eta_p_6329; + int64_t eta_p_6330 = (int64_t) 1; + int64_t old_6866; + + old_6866 = atomic_add_i64_shared(&((volatile __local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(thread_local_subhisto_i_6856) * hist_H_chk_6838 + (u8_res_6334 - sext_i32_i64(chk_i_6836) * hist_H_chk_6838)], (int64_t) eta_p_6330); + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + // Compact the multiple shared memory subhistograms to result in global memory + { + int64_t trunc_H_6867 = smin64(hist_H_chk_6838, (int64_t) 256 - sext_i32_i64(chk_i_6836) * hist_H_chk_6838); + int32_t histo_sizze_6868 = sext_i64_i32(trunc_H_6867); + + for (int32_t local_i_6869 = 0; local_i_6869 < init_per_thread_6840; local_i_6869++) { + int32_t j_6870 = local_i_6869 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + + if (slt32(j_6870, histo_sizze_6868)) { + int64_t eta_p_6329; + int64_t eta_p_6330; + + // Read values from subhistogram 0. + { + eta_p_6329 = ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(j_6870)]; + } + // Accumulate based on values in other subhistograms. + { + for (int32_t subhisto_id_6871 = 0; subhisto_id_6871 < hist_M_6832 - 1; subhisto_id_6871++) { + eta_p_6330 = ((__local int64_t *) subhistogram_local_mem_6854)[(sext_i32_i64(subhisto_id_6871) + (int64_t) 1) * hist_H_chk_6838 + sext_i32_i64(j_6870)]; + + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + eta_p_6329 = defunc_0_op_res_6331; + } + } + // Put final bucket value in global memory. 
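+ // (Each virtual block folds its hist_M_6832 local copies of a bucket into
+ // slot srem64(virt_tblock_id_6849, num_tblocks_6826) of the global
+ // subhistogram array; the entropyzisegred_* kernels below then reduce
+ // those copies into the final 256-bucket byte histogram. Downstream,
+ // entropyzisegred_nonseg_6344 maps each count c to (c/n) * log2(c/n),
+ // treating 0 * log2(0) as 0, and sums the terms; the Shannon entropy of
+ // the byte distribution is the negation of that sum.)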
+ { + ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[srem64(sext_i32_i64(virt_tblock_id_6849), num_tblocks_6826) * (int64_t) 256 + (sext_i32_i64(j_6870) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838)] = eta_p_6329; + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_2: + return; + #undef max_tblock_sizze_6825 +} +FUTHARK_KERNEL_SIZED(entropyzisegred_large_6901_dim1, 1, 1) +void entropyzisegred_large_6901(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t num_tblocks_6323, int64_t num_subhistos_6815, int64_t blocks_per_segment_6932, int64_t q_6933, int64_t num_virtblocks_6934, int64_t threads_per_segment_6935, __global unsigned char *mem_6758, __global unsigned char *defunc_0_map_res_subhistos_mem_6816, __global unsigned char *segred_tmp_mem_6936, __global unsigned char *counters_mem_6938) +{ + #define seghist_tblock_sizze_6321 (entropyzisegred_large_6901ziseghist_tblock_sizze_6321) + #define chunk_sizze_6902 (entropyzisegred_large_6901zichunk_sizze_6902) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *sync_arr_mem_6967_backing_1 = &shared_mem[0]; + const int64_t sync_arr_mem_6967_backing_1_offset = 0 + 8; + volatile __local unsigned char *red_arr_i64_mem_6965_backing_0 = &shared_mem[sync_arr_mem_6967_backing_1_offset]; + const int64_t red_arr_i64_mem_6965_backing_0_offset = sync_arr_mem_6967_backing_1_offset + ((int64_t) 8 * seghist_tblock_sizze_6321 + srem64((int64_t) 8 - srem64((int64_t) 8 * seghist_tblock_sizze_6321, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6961; + int32_t tblock_sizze_6964; + int32_t wave_sizze_6963; + int32_t block_id_6962; + int32_t global_tid_6960; + int64_t flat_gtid_6901; + __local unsigned char *red_arr_i64_mem_6965; + __local unsigned char *sync_arr_mem_6967; + int32_t phys_tblock_id_6969; + int32_t iterations_6970; + + local_tid_6961 = get_local_id(0); + tblock_sizze_6964 = get_local_size(0); + wave_sizze_6963 = LOCKSTEP_WIDTH; + block_id_6962 = get_tblock_id(0); + global_tid_6960 = block_id_6962 * tblock_sizze_6964 + local_tid_6961; + flat_gtid_6901 = sext_i32_i64(global_tid_6960); + red_arr_i64_mem_6965 = (__local unsigned char *) red_arr_i64_mem_6965_backing_0; + sync_arr_mem_6967 = (__local unsigned char *) sync_arr_mem_6967_backing_1; + phys_tblock_id_6969 = get_tblock_id(0); + iterations_6970 = sdiv_up32(sext_i64_i32(num_virtblocks_6934) - phys_tblock_id_6969, sext_i64_i32(num_tblocks_6323)); + for (int32_t i_6971 = 0; i_6971 < iterations_6970; i_6971++) { + int32_t virt_tblock_id_6972; + int64_t flat_segment_id_6973; + int64_t global_tid_6974; + int64_t slice_6975; + int64_t bucket_id_6899; + int64_t remnant_6976; + int64_t subhistogram_id_6900; + int64_t eta_p_block_res_acc_6977; + int64_t eta_p_6329; + int64_t eta_p_6330; + int64_t tblock_id_in_segment_6981; + int64_t block_base_offset_6982; + int32_t offset_6985; + int32_t skip_waves_6986; + int64_t eta_p_6978; + int64_t eta_p_6979; + + virt_tblock_id_6972 = phys_tblock_id_6969 + i_6971 * sext_i64_i32(num_tblocks_6323); + flat_segment_id_6973 = squot64(sext_i32_i64(virt_tblock_id_6972), blocks_per_segment_6932); + global_tid_6974 = srem64(sext_i32_i64(virt_tblock_id_6972) * seghist_tblock_sizze_6321 + sext_i32_i64(local_tid_6961), threads_per_segment_6935); + slice_6975 = (int64_t) 256; + bucket_id_6899 = flat_segment_id_6973; + remnant_6976 = flat_segment_id_6973 - bucket_id_6899; + // ne-initialise the outer 
(per-block) accumulator(s) + { + eta_p_block_res_acc_6977 = (int64_t) 0; + } + tblock_id_in_segment_6981 = squot64(global_tid_6974, seghist_tblock_sizze_6321); + block_base_offset_6982 = tblock_id_in_segment_6981 * q_6933 * seghist_tblock_sizze_6321; + for (int64_t i_6983 = 0; i_6983 < q_6933; i_6983++) { + int64_t block_offset_6984 = block_base_offset_6982 + i_6983 * seghist_tblock_sizze_6321; + + subhistogram_id_6900 = global_tid_6974 + threads_per_segment_6935 * i_6983; + if (slt64(subhistogram_id_6900, num_subhistos_6815)) { + // apply map function(s) + { + // load accumulator(s) + { + eta_p_6329 = eta_p_block_res_acc_6977; + } + // load next value(s) + { + eta_p_6330 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[subhistogram_id_6900 * (int64_t) 256 + bucket_id_6899]; + } + // apply reduction operator(s) + { + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + // store in accumulator(s) + { + eta_p_block_res_acc_6977 = defunc_0_op_res_6331; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // store accs. prims go in lmem; non-prims in params (in global mem) + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_block_res_acc_6977; + } + barrier(CLK_LOCAL_MEM_FENCE); + skip_waves_6986 = 1; + offset_6985 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6961, sext_i64_i32(seghist_tblock_sizze_6321))) { + eta_p_6978 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + } + offset_6985 = 1; + while (slt32(offset_6985, wave_sizze_6963)) { + if (slt32(local_tid_6961 + offset_6985, sext_i64_i32(seghist_tblock_sizze_6321)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) & (2 * offset_6985 - 1)) == 0) { + // read array element + { + eta_p_6979 = ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + offset_6985 *= 2; + } + while (slt32(skip_waves_6986, squot32(sext_i64_i32(seghist_tblock_sizze_6321) + wave_sizze_6963 - 1, wave_sizze_6963))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6985 = skip_waves_6986 * wave_sizze_6963; + if (slt32(local_tid_6961 + offset_6985, sext_i64_i32(seghist_tblock_sizze_6321)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) == 0 && (squot32(local_tid_6961, wave_sizze_6963) & (2 * skip_waves_6986 - 1)) == 0)) { + // read array element + { + eta_p_6979 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + skip_waves_6986 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + // thread 0 updates per-block acc(s); rest reset to ne + { + if (sext_i32_i64(local_tid_6961) == (int64_t) 0) { + eta_p_block_res_acc_6977 = eta_p_6978; + } else { + eta_p_block_res_acc_6977 = (int64_t) 0; + } + } + if (blocks_per_segment_6932 == (int64_t) 1) { + // first thread in block saves final result to memory + { + if 
(local_tid_6961 == 0) { + ((__global int64_t *) mem_6758)[bucket_id_6899] = eta_p_block_res_acc_6977; + } + } + } else { + int32_t old_counter_6987; + bool is_last_block_6988; + + // first thread in block saves block result to global memory + { + if (local_tid_6961 == 0) { + ((__global int64_t *) segred_tmp_mem_6936)[sext_i32_i64(virt_tblock_id_6972)] = eta_p_block_res_acc_6977; + mem_fence_global(); + old_counter_6987 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6938)[srem64(flat_segment_id_6973, (int64_t) 20480)], (int) 1); + ((__local bool *) sync_arr_mem_6967)[(int64_t) 0] = old_counter_6987 == sext_i64_i32(blocks_per_segment_6932 - (int64_t) 1); + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + is_last_block_6988 = ((__local bool *) sync_arr_mem_6967)[(int64_t) 0]; + if (is_last_block_6988) { + if (local_tid_6961 == 0) { + old_counter_6987 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6938)[srem64(flat_segment_id_6973, (int64_t) 20480)], (int) sext_i64_i32((int64_t) 0 - blocks_per_segment_6932)); + } + // read in the per-block-results + { + int64_t read_per_thread_6989 = sdiv_up64(blocks_per_segment_6932, seghist_tblock_sizze_6321); + + eta_p_6329 = (int64_t) 0; + for (int64_t i_6990 = 0; i_6990 < read_per_thread_6989; i_6990++) { + int64_t block_res_id_6991 = sext_i32_i64(local_tid_6961) * read_per_thread_6989 + i_6990; + int64_t index_of_block_res_6992 = flat_segment_id_6973 * blocks_per_segment_6932 + block_res_id_6991; + + if (slt64(block_res_id_6991, blocks_per_segment_6932)) { + eta_p_6330 = ((__global int64_t *) segred_tmp_mem_6936)[index_of_block_res_6992]; + + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + eta_p_6329 = defunc_0_op_res_6331; + } + } + } + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6329; + barrier(CLK_LOCAL_MEM_FENCE); + // reduce the per-block results + { + int32_t offset_6993; + int32_t skip_waves_6994 = 1; + int64_t eta_p_6978; + int64_t eta_p_6979; + + offset_6993 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6961, sext_i64_i32(seghist_tblock_sizze_6321))) { + eta_p_6978 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + } + offset_6993 = 1; + while (slt32(offset_6993, wave_sizze_6963)) { + if (slt32(local_tid_6961 + offset_6993, sext_i64_i32(seghist_tblock_sizze_6321)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) & (2 * offset_6993 - 1)) == 0) { + // read array element + { + eta_p_6979 = ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + offset_6993 *= 2; + } + while (slt32(skip_waves_6994, squot32(sext_i64_i32(seghist_tblock_sizze_6321) + wave_sizze_6963 - 1, wave_sizze_6963))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6993 = skip_waves_6994 * wave_sizze_6963; + if (slt32(local_tid_6961 + offset_6993, sext_i64_i32(seghist_tblock_sizze_6321)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) == 0 && (squot32(local_tid_6961, wave_sizze_6963) & (2 * skip_waves_6994 - 1)) == 0)) { + // read array element + { + eta_p_6979 = ((__local int64_t *) 
red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + skip_waves_6994 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + // and back to memory with the final result + { + if (local_tid_6961 == 0) { + ((__global int64_t *) mem_6758)[bucket_id_6899] = eta_p_6978; + } + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_6: + return; + #undef seghist_tblock_sizze_6321 + #undef chunk_sizze_6902 +} +FUTHARK_KERNEL_SIZED(entropyzisegred_nonseg_6344_dim1, 1, 1) +void entropyzisegred_nonseg_6344(__local uint64_t *shared_mem_aligned, __global int *global_failure, float i64_res_6254, int64_t num_tblocks_6339, int64_t num_threads_7001, __global unsigned char *mem_6758, __global unsigned char *mem_6761, __global unsigned char *counters_mem_6997, __global unsigned char *segred_tmp_mem_6999) +{ + #define segred_tblock_sizze_6337 (entropyzisegred_nonseg_6344zisegred_tblock_sizze_6337) + #define chunk_sizze_6996 (entropyzisegred_nonseg_6344zichunk_sizze_6996) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *sync_arr_mem_7009_backing_1 = &shared_mem[0]; + const int64_t sync_arr_mem_7009_backing_1_offset = 0 + 8; + volatile __local unsigned char *red_arr_f32_mem_7007_backing_0 = &shared_mem[sync_arr_mem_7009_backing_1_offset]; + const int64_t red_arr_f32_mem_7007_backing_0_offset = sync_arr_mem_7009_backing_1_offset + ((int64_t) 4 * segred_tblock_sizze_6337 + srem64((int64_t) 8 - srem64((int64_t) 4 * segred_tblock_sizze_6337, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_7003; + int32_t tblock_sizze_7006; + int32_t wave_sizze_7005; + int32_t block_id_7004; + int32_t global_tid_7002; + int64_t phys_tid_6344; + __local unsigned char *red_arr_f32_mem_7007; + __local unsigned char *sync_arr_mem_7009; + int64_t dummy_6342; + int64_t gtid_6343; + int64_t q_7011; + float eta_p_block_res_acc_7012; + float eta_p_6266; + float eta_p_6267; + int64_t tblock_id_in_segment_7016; + int64_t block_base_offset_7017; + int32_t offset_7020; + int32_t skip_waves_7021; + float eta_p_7013; + float eta_p_7014; + int32_t old_counter_7022; + bool is_last_block_7023; + + local_tid_7003 = get_local_id(0); + tblock_sizze_7006 = get_local_size(0); + wave_sizze_7005 = LOCKSTEP_WIDTH; + block_id_7004 = get_tblock_id(0); + global_tid_7002 = block_id_7004 * tblock_sizze_7006 + local_tid_7003; + phys_tid_6344 = sext_i32_i64(global_tid_7002); + red_arr_f32_mem_7007 = (__local unsigned char *) red_arr_f32_mem_7007_backing_0; + sync_arr_mem_7009 = (__local unsigned char *) sync_arr_mem_7009_backing_1; + dummy_6342 = (int64_t) 0; + gtid_6343 = (int64_t) 0; + q_7011 = sdiv_up64((int64_t) 256, sext_i32_i64(sext_i64_i32(segred_tblock_sizze_6337 * num_tblocks_6339)) * chunk_sizze_6996); + // ne-initialise the outer (per-block) accumulator(s) + { + eta_p_block_res_acc_7012 = 0.0F; + } + tblock_id_in_segment_7016 = squot64(phys_tid_6344, segred_tblock_sizze_6337); + block_base_offset_7017 = tblock_id_in_segment_7016 * q_7011 * segred_tblock_sizze_6337; + for (int64_t i_7018 = 0; i_7018 < q_7011; i_7018++) { + int64_t block_offset_7019 = block_base_offset_7017 + i_7018 * segred_tblock_sizze_6337; + + gtid_6343 = phys_tid_6344 + 
num_threads_7001 * i_7018; + if (slt64(gtid_6343, (int64_t) 256)) { + // apply map function(s) + { + // apply map function + { + int64_t eta_p_6282 = ((__global int64_t *) mem_6758)[gtid_6343]; + float i64_res_6283 = sitofp_i64_f32(eta_p_6282); + float lifted_lambda_res_6284 = i64_res_6283 / i64_res_6254; + bool cond_6286 = lifted_lambda_res_6284 == 0.0F; + float lifted_lambda_res_6287; + + if (cond_6286) { + lifted_lambda_res_6287 = 0.0F; + } else { + float log2_res_6288 = futrts_log2_32(lifted_lambda_res_6284); + float lifted_lambda_res_f_res_6289 = lifted_lambda_res_6284 * log2_res_6288; + + lifted_lambda_res_6287 = lifted_lambda_res_f_res_6289; + } + // load accumulator(s) + { + eta_p_6266 = eta_p_block_res_acc_7012; + } + // load next value(s) + { + eta_p_6267 = lifted_lambda_res_6287; + } + // apply reduction operator(s) + { + float defunc_0_op_res_6268 = eta_p_6266 + eta_p_6267; + + // store in accumulator(s) + { + eta_p_block_res_acc_7012 = defunc_0_op_res_6268; + } + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // store accs. prims go in lmem; non-prims in params (in global mem) + { + ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_block_res_acc_7012; + } + barrier(CLK_LOCAL_MEM_FENCE); + skip_waves_7021 = 1; + offset_7020 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_7003, sext_i64_i32(segred_tblock_sizze_6337))) { + eta_p_7013 = ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7020)]; + } + } + offset_7020 = 1; + while (slt32(offset_7020, wave_sizze_7005)) { + if (slt32(local_tid_7003 + offset_7020, sext_i64_i32(segred_tblock_sizze_6337)) && ((local_tid_7003 - squot32(local_tid_7003, wave_sizze_7005) * wave_sizze_7005) & (2 * offset_7020 - 1)) == 0) { + // read array element + { + eta_p_7014 = ((volatile __local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7020)]; + } + // apply reduction operation + { + float defunc_0_op_res_7015 = eta_p_7013 + eta_p_7014; + + eta_p_7013 = defunc_0_op_res_7015; + } + // write result of operation + { + ((volatile __local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_7013; + } + } + offset_7020 *= 2; + } + while (slt32(skip_waves_7021, squot32(sext_i64_i32(segred_tblock_sizze_6337) + wave_sizze_7005 - 1, wave_sizze_7005))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_7020 = skip_waves_7021 * wave_sizze_7005; + if (slt32(local_tid_7003 + offset_7020, sext_i64_i32(segred_tblock_sizze_6337)) && ((local_tid_7003 - squot32(local_tid_7003, wave_sizze_7005) * wave_sizze_7005) == 0 && (squot32(local_tid_7003, wave_sizze_7005) & (2 * skip_waves_7021 - 1)) == 0)) { + // read array element + { + eta_p_7014 = ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7020)]; + } + // apply reduction operation + { + float defunc_0_op_res_7015 = eta_p_7013 + eta_p_7014; + + eta_p_7013 = defunc_0_op_res_7015; + } + // write result of operation + { + ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_7013; + } + } + skip_waves_7021 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + // thread 0 updates per-block acc(s); rest reset to ne + { + if (sext_i32_i64(local_tid_7003) == (int64_t) 0) { + eta_p_block_res_acc_7012 = eta_p_7013; + } else { + eta_p_block_res_acc_7012 = 0.0F; + } + } + // first thread in block saves block result to global memory + { + if (local_tid_7003 == 0) { + ((__global float *) 
segred_tmp_mem_6999)[sext_i32_i64(block_id_7004)] = eta_p_block_res_acc_7012; + mem_fence_global(); + old_counter_7022 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6997)[(int64_t) 0], (int) 1); + ((__local bool *) sync_arr_mem_7009)[(int64_t) 0] = old_counter_7022 == sext_i64_i32(num_tblocks_6339 - (int64_t) 1); + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + is_last_block_7023 = ((__local bool *) sync_arr_mem_7009)[(int64_t) 0]; + if (is_last_block_7023) { + if (local_tid_7003 == 0) { + old_counter_7022 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6997)[(int64_t) 0], (int) sext_i64_i32((int64_t) 0 - num_tblocks_6339)); + } + // read in the per-block-results + { + int64_t read_per_thread_7024 = sdiv_up64(num_tblocks_6339, segred_tblock_sizze_6337); + + eta_p_6266 = 0.0F; + for (int64_t i_7025 = 0; i_7025 < read_per_thread_7024; i_7025++) { + int64_t block_res_id_7026 = sext_i32_i64(local_tid_7003) * read_per_thread_7024 + i_7025; + int64_t index_of_block_res_7027 = block_res_id_7026; + + if (slt64(block_res_id_7026, num_tblocks_6339)) { + eta_p_6267 = ((__global float *) segred_tmp_mem_6999)[index_of_block_res_7027]; + + float defunc_0_op_res_6268 = eta_p_6266 + eta_p_6267; + + eta_p_6266 = defunc_0_op_res_6268; + } + } + } + ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_6266; + barrier(CLK_LOCAL_MEM_FENCE); + // reduce the per-block results + { + int32_t offset_7028; + int32_t skip_waves_7029 = 1; + float eta_p_7013; + float eta_p_7014; + + offset_7028 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_7003, sext_i64_i32(segred_tblock_sizze_6337))) { + eta_p_7013 = ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7028)]; + } + } + offset_7028 = 1; + while (slt32(offset_7028, wave_sizze_7005)) { + if (slt32(local_tid_7003 + offset_7028, sext_i64_i32(segred_tblock_sizze_6337)) && ((local_tid_7003 - squot32(local_tid_7003, wave_sizze_7005) * wave_sizze_7005) & (2 * offset_7028 - 1)) == 0) { + // read array element + { + eta_p_7014 = ((volatile __local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7028)]; + } + // apply reduction operation + { + float defunc_0_op_res_7015 = eta_p_7013 + eta_p_7014; + + eta_p_7013 = defunc_0_op_res_7015; + } + // write result of operation + { + ((volatile __local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_7013; + } + } + offset_7028 *= 2; + } + while (slt32(skip_waves_7029, squot32(sext_i64_i32(segred_tblock_sizze_6337) + wave_sizze_7005 - 1, wave_sizze_7005))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_7028 = skip_waves_7029 * wave_sizze_7005; + if (slt32(local_tid_7003 + offset_7028, sext_i64_i32(segred_tblock_sizze_6337)) && ((local_tid_7003 - squot32(local_tid_7003, wave_sizze_7005) * wave_sizze_7005) == 0 && (squot32(local_tid_7003, wave_sizze_7005) & (2 * skip_waves_7029 - 1)) == 0)) { + // read array element + { + eta_p_7014 = ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7028)]; + } + // apply reduction operation + { + float defunc_0_op_res_7015 = eta_p_7013 + eta_p_7014; + + eta_p_7013 = defunc_0_op_res_7015; + } + // write result of operation + { + ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_7013; + } + } + skip_waves_7029 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + // and back to memory with the final result + { + if (local_tid_7003 == 0) { + ((__global float *) mem_6761)[(int64_t) 0] = 
eta_p_7013; + } + } + } + } + + error_5: + return; + #undef segred_tblock_sizze_6337 + #undef chunk_sizze_6996 +} +FUTHARK_KERNEL_SIZED(entropyzisegred_small_6901_dim1, 1, 1) +void entropyzisegred_small_6901(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t num_tblocks_6323, int64_t num_subhistos_6815, int64_t segment_sizze_nonzzero_6903, __global unsigned char *mem_6758, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define seghist_tblock_sizze_6321 (entropyzisegred_small_6901ziseghist_tblock_sizze_6321) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *red_arr_i64_mem_6910_backing_0 = &shared_mem[0]; + const int64_t red_arr_i64_mem_6910_backing_0_offset = 0 + ((int64_t) 8 * seghist_tblock_sizze_6321 + srem64((int64_t) 8 - srem64((int64_t) 8 * seghist_tblock_sizze_6321, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6906; + int32_t tblock_sizze_6909; + int32_t wave_sizze_6908; + int32_t block_id_6907; + int32_t global_tid_6905; + int64_t flat_gtid_6901; + __local unsigned char *red_arr_i64_mem_6910; + int32_t phys_tblock_id_6912; + int32_t iterations_6913; + + local_tid_6906 = get_local_id(0); + tblock_sizze_6909 = get_local_size(0); + wave_sizze_6908 = LOCKSTEP_WIDTH; + block_id_6907 = get_tblock_id(0); + global_tid_6905 = block_id_6907 * tblock_sizze_6909 + local_tid_6906; + flat_gtid_6901 = sext_i32_i64(global_tid_6905); + red_arr_i64_mem_6910 = (__local unsigned char *) red_arr_i64_mem_6910_backing_0; + phys_tblock_id_6912 = get_tblock_id(0); + iterations_6913 = sdiv_up32(sext_i64_i32(sdiv_up64((int64_t) 256, squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903))) - phys_tblock_id_6912, sext_i64_i32(num_tblocks_6323)); + for (int32_t i_6914 = 0; i_6914 < iterations_6913; i_6914++) { + int32_t virt_tblock_id_6915; + int64_t slice_6916; + int64_t bucket_id_6899; + int64_t remnant_6917; + int64_t subhistogram_id_6900; + + virt_tblock_id_6915 = phys_tblock_id_6912 + i_6914 * sext_i64_i32(num_tblocks_6323); + slice_6916 = (int64_t) 256; + bucket_id_6899 = squot64(sext_i32_i64(local_tid_6906), segment_sizze_nonzzero_6903) + sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903); + remnant_6917 = squot64(sext_i32_i64(local_tid_6906), segment_sizze_nonzzero_6903) + sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903) - bucket_id_6899; + subhistogram_id_6900 = srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815); + // apply map function if in bounds + { + if (slt64((int64_t) 0, num_subhistos_6815) && (slt64(bucket_id_6899, (int64_t) 256) && slt64(sext_i32_i64(local_tid_6906), num_subhistos_6815 * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903)))) { + // save results to be reduced + { + int64_t tmp_6918 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[subhistogram_id_6900 * (int64_t) 256 + bucket_id_6899]; + + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = tmp_6918; + } + } else { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = (int64_t) 0; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (slt64((int64_t) 0, num_subhistos_6815)) { + // perform segmented scan to imitate reduction + { + int64_t eta_p_6329; + int64_t eta_p_6330; + int64_t eta_p_6919; + int64_t eta_p_6920; + bool ltid_in_bounds_6922 = slt64(sext_i32_i64(local_tid_6906), num_subhistos_6815 * 
squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903)); + int32_t skip_threads_6923; + + // read input for in-block scan + { + if (ltid_in_bounds_6922) { + eta_p_6330 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)]; + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 0) { + eta_p_6329 = eta_p_6330; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6923 = 1; + while (slt32(skip_threads_6923, 32)) { + bool thread_active_6924 = sle32(skip_threads_6923, local_tid_6906 - squot32(local_tid_6906, 32) * 32) && ltid_in_bounds_6922; + + if (thread_active_6924) { + // read operands + { + eta_p_6329 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906) - sext_i32_i64(skip_threads_6923)]; + } + } + // perform operation + { + bool inactive_6925 = slt64(srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815), sext_i32_i64(local_tid_6906) - sext_i32_i64(local_tid_6906 - skip_threads_6923)); + + if (thread_active_6924 && inactive_6925) { + eta_p_6329 = eta_p_6330; + } + if (thread_active_6924) { + if (!inactive_6925) { + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + eta_p_6329 = defunc_0_op_res_6331; + } + } + } + if (sle32(wave_sizze_6908, skip_threads_6923)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6924) { + // write result + { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6329; + eta_p_6330 = eta_p_6329; + } + } + if (sle32(wave_sizze_6908, skip_threads_6923)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6923 *= 2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // last thread of block 'i' writes its result to offset 'i' + { + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 31 && ltid_in_bounds_6922) { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(squot32(local_tid_6906, 32))] = eta_p_6329; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // scan the first block, after which offset 'i' contains carry-in for block 'i+1' + { + int32_t skip_threads_6926; + + // read input for in-block scan + { + if (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922) { + eta_p_6920 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)]; + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 0) { + eta_p_6919 = eta_p_6920; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6926 = 1; + while (slt32(skip_threads_6926, 32)) { + bool thread_active_6927 = sle32(skip_threads_6926, local_tid_6906 - squot32(local_tid_6906, 32) * 32) && (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922); + + if (thread_active_6927) { + // read operands + { + eta_p_6919 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906) - sext_i32_i64(skip_threads_6926)]; + } + } + // perform operation + { + bool inactive_6928 = slt64(srem64(sext_i32_i64(local_tid_6906 * 32 + 32 - 1), num_subhistos_6815), sext_i32_i64(local_tid_6906 * 32 + 32 - 1) - sext_i32_i64((local_tid_6906 - skip_threads_6926) * 32 + 32 - 1)); + + if (thread_active_6927 && inactive_6928) { + eta_p_6919 = eta_p_6920; + } + if (thread_active_6927) { + if (!inactive_6928) { + int64_t defunc_0_op_res_6921 = add64(eta_p_6919, eta_p_6920); + + eta_p_6919 = defunc_0_op_res_6921; + } + } + } + if (sle32(wave_sizze_6908, skip_threads_6926)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6927) { + // write result + { + ((volatile __local int64_t *) 
red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6919; + eta_p_6920 = eta_p_6919; + } + } + if (sle32(wave_sizze_6908, skip_threads_6926)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6926 *= 2; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + bool no_carry_in_6929 = squot32(local_tid_6906, 32) == 0 || !ltid_in_bounds_6922; + + // carry-in for every block except the first + { + // read operands + { + if (!no_carry_in_6929) { + eta_p_6330 = eta_p_6329; + eta_p_6329 = ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(squot32(local_tid_6906, 32)) - (int64_t) 1]; + } + } + // perform operation + { + bool inactive_6930 = slt64(srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815), sext_i32_i64(local_tid_6906) - sext_i32_i64(squot32(local_tid_6906, 32) * 32 - 1)); + + if (!no_carry_in_6929) { + if (inactive_6930) { + eta_p_6329 = eta_p_6330; + } + } + if (!no_carry_in_6929) { + if (!inactive_6930) { + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + eta_p_6329 = defunc_0_op_res_6331; + } + } + } + // write final result + { + if (!no_carry_in_6929) { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6329; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // restore correct values for first block + { + if (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922) { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6330; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // save final values of segments + { + if (slt64(sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903) + sext_i32_i64(local_tid_6906), (int64_t) 256) && slt64(sext_i32_i64(local_tid_6906), squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903))) { + int64_t tmp_6931 = ((__local int64_t *) red_arr_i64_mem_6910)[(sext_i32_i64(local_tid_6906) + (int64_t) 1) * segment_sizze_nonzzero_6903 - (int64_t) 1]; + + ((__global int64_t *) mem_6758)[sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903) + sext_i32_i64(local_tid_6906)] = tmp_6931; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_3: + return; + #undef seghist_tblock_sizze_6321 +} +""" +# Start of values.py. + +# Hacky parser/reader/writer for values written in Futhark syntax. +# Used for reading stdin when compiling standalone programs with the +# Python code generator. + +import numpy as np +import struct +import sys + + +class ReaderInput: + def __init__(self, f): + self.f = f + self.lookahead_buffer = [] + + def get_char(self): + if len(self.lookahead_buffer) == 0: + return self.f.read(1) + else: + c = self.lookahead_buffer[0] + self.lookahead_buffer = self.lookahead_buffer[1:] + return c + + def unget_char(self, c): + self.lookahead_buffer = [c] + self.lookahead_buffer + + def get_chars(self, n): + n1 = min(n, len(self.lookahead_buffer)) + s = b"".join(self.lookahead_buffer[:n1]) + self.lookahead_buffer = self.lookahead_buffer[n1:] + n2 = n - n1 + if n2 > 0: + s += self.f.read(n2) + return s + + def peek_char(self): + c = self.get_char() + if c: + self.unget_char(c) + return c + + +def skip_spaces(f): + c = f.get_char() + while c != None: + if c.isspace(): + c = f.get_char() + elif c == b"-": + # May be line comment. + if f.peek_char() == b"-": + # Yes, line comment. Skip to end of line. 
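+                # (Illustration: Futhark's textual value syntax uses "--"
+                # line comments, so input like b"-- note\n[1i32, 2i32]" is
+                # consumed up to the newline before the array itself is
+                # read; a lone "-", as in b"-5i32", breaks out of the loop
+                # and is pushed back via unget_char below for the numeric
+                # parsers.)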
+ while c != b"\n" and c != None: + c = f.get_char() + else: + break + else: + break + if c: + f.unget_char(c) + + +def parse_specific_char(f, expected): + got = f.get_char() + if got != expected: + f.unget_char(got) + raise ValueError + return True + + +def parse_specific_string(f, s): + # This funky mess is intended, and is caused by the fact that if `type(b) == + # bytes` then `type(b[0]) == int`, but we need to match each element with a + # `bytes`, so therefore we make each character an array element + b = s.encode("utf8") + bs = [b[i : i + 1] for i in range(len(b))] + read = [] + try: + for c in bs: + parse_specific_char(f, c) + read.append(c) + return True + except ValueError: + for c in read[::-1]: + f.unget_char(c) + raise + + +def optional(p, *args): + try: + return p(*args) + except ValueError: + return None + + +def optional_specific_string(f, s): + c = f.peek_char() + # This funky mess is intended, and is caused by the fact that if `type(b) == + # bytes` then `type(b[0]) == int`, but we need to match each element with a + # `bytes`, so therefore we make each character an array element + b = s.encode("utf8") + bs = [b[i : i + 1] for i in range(len(b))] + if c == bs[0]: + return parse_specific_string(f, s) + else: + return False + + +def sepBy(p, sep, *args): + elems = [] + x = optional(p, *args) + if x != None: + elems += [x] + while optional(sep, *args) != None: + x = p(*args) + elems += [x] + return elems + + +# Assumes '0x' has already been read +def parse_hex_int(f): + s = b"" + c = f.get_char() + while c != None: + if c in b"01234556789ABCDEFabcdef": + s += c + c = f.get_char() + elif c == b"_": + c = f.get_char() # skip _ + else: + f.unget_char(c) + break + return str(int(s, 16)).encode("utf8") # ugh + + +def parse_int(f): + s = b"" + c = f.get_char() + if c == b"0" and f.peek_char() in b"xX": + c = f.get_char() # skip X + return parse_hex_int(f) + else: + while c != None: + if c.isdigit(): + s += c + c = f.get_char() + elif c == b"_": + c = f.get_char() # skip _ + else: + f.unget_char(c) + break + if len(s) == 0: + raise ValueError + return s + + +def parse_int_signed(f): + s = b"" + c = f.get_char() + + if c == b"-" and f.peek_char().isdigit(): + return c + parse_int(f) + else: + if c != b"+": + f.unget_char(c) + return parse_int(f) + + +def read_str_comma(f): + skip_spaces(f) + parse_specific_char(f, b",") + return b"," + + +def read_str_int(f, s): + skip_spaces(f) + x = int(parse_int_signed(f)) + optional_specific_string(f, s) + return x + + +def read_str_uint(f, s): + skip_spaces(f) + x = int(parse_int(f)) + optional_specific_string(f, s) + return x + + +def read_str_i8(f): + return np.int8(read_str_int(f, "i8")) + + +def read_str_i16(f): + return np.int16(read_str_int(f, "i16")) + + +def read_str_i32(f): + return np.int32(read_str_int(f, "i32")) + + +def read_str_i64(f): + return np.int64(read_str_int(f, "i64")) + + +def read_str_u8(f): + return np.uint8(read_str_int(f, "u8")) + + +def read_str_u16(f): + return np.uint16(read_str_int(f, "u16")) + + +def read_str_u32(f): + return np.uint32(read_str_int(f, "u32")) + + +def read_str_u64(f): + return np.uint64(read_str_int(f, "u64")) + + +def read_char(f): + skip_spaces(f) + parse_specific_char(f, b"'") + c = f.get_char() + parse_specific_char(f, b"'") + return c + + +def read_str_hex_float(f, sign): + int_part = parse_hex_int(f) + parse_specific_char(f, b".") + frac_part = parse_hex_int(f) + parse_specific_char(f, b"p") + exponent = parse_int(f) + + int_val = int(int_part, 16) + frac_val = float(int(frac_part, 16)) / 
(16 ** len(frac_part)) + exp_val = int(exponent) + + total_val = (int_val + frac_val) * (2.0**exp_val) + if sign == b"-": + total_val = -1 * total_val + + return float(total_val) + + +def read_str_decimal(f): + skip_spaces(f) + c = f.get_char() + if c == b"-": + sign = b"-" + else: + f.unget_char(c) + sign = b"" + + # Check for hexadecimal float + c = f.get_char() + if c == "0" and (f.peek_char() in ["x", "X"]): + f.get_char() + return read_str_hex_float(f, sign) + else: + f.unget_char(c) + + bef = optional(parse_int, f) + if bef == None: + bef = b"0" + parse_specific_char(f, b".") + aft = parse_int(f) + elif optional(parse_specific_char, f, b"."): + aft = parse_int(f) + else: + aft = b"0" + if optional(parse_specific_char, f, b"E") or optional(parse_specific_char, f, b"e"): + expt = parse_int_signed(f) + else: + expt = b"0" + return float(sign + bef + b"." + aft + b"E" + expt) + + +def read_str_f16(f): + skip_spaces(f) + try: + parse_specific_string(f, "f16.nan") + return np.float32(np.nan) + except ValueError: + try: + parse_specific_string(f, "f16.inf") + return np.float32(np.inf) + except ValueError: + try: + parse_specific_string(f, "-f16.inf") + return np.float32(-np.inf) + except ValueError: + x = read_str_decimal(f) + optional_specific_string(f, "f16") + return x + + +def read_str_f32(f): + skip_spaces(f) + try: + parse_specific_string(f, "f32.nan") + return np.float32(np.nan) + except ValueError: + try: + parse_specific_string(f, "f32.inf") + return np.float32(np.inf) + except ValueError: + try: + parse_specific_string(f, "-f32.inf") + return np.float32(-np.inf) + except ValueError: + x = read_str_decimal(f) + optional_specific_string(f, "f32") + return x + + +def read_str_f64(f): + skip_spaces(f) + try: + parse_specific_string(f, "f64.nan") + return np.float64(np.nan) + except ValueError: + try: + parse_specific_string(f, "f64.inf") + return np.float64(np.inf) + except ValueError: + try: + parse_specific_string(f, "-f64.inf") + return np.float64(-np.inf) + except ValueError: + x = read_str_decimal(f) + optional_specific_string(f, "f64") + return x + + +def read_str_bool(f): + skip_spaces(f) + if f.peek_char() == b"t": + parse_specific_string(f, "true") + return True + elif f.peek_char() == b"f": + parse_specific_string(f, "false") + return False + else: + raise ValueError + + +def read_str_empty_array(f, type_name, rank): + parse_specific_string(f, "empty") + parse_specific_char(f, b"(") + dims = [] + for i in range(rank): + parse_specific_string(f, "[") + dims += [int(parse_int(f))] + parse_specific_string(f, "]") + if np.prod(dims) != 0: + raise ValueError + parse_specific_string(f, type_name) + parse_specific_char(f, b")") + + return tuple(dims) + + +def read_str_array_elems(f, elem_reader, type_name, rank): + skip_spaces(f) + try: + parse_specific_char(f, b"[") + except ValueError: + return read_str_empty_array(f, type_name, rank) + else: + xs = sepBy(elem_reader, read_str_comma, f) + skip_spaces(f) + parse_specific_char(f, b"]") + return xs + + +def read_str_array_helper(f, elem_reader, type_name, rank): + def nested_row_reader(_): + return read_str_array_helper(f, elem_reader, type_name, rank - 1) + + if rank == 1: + row_reader = elem_reader + else: + row_reader = nested_row_reader + return read_str_array_elems(f, row_reader, type_name, rank) + + +def expected_array_dims(l, rank): + if rank > 1: + n = len(l) + if n == 0: + elem = [] + else: + elem = l[0] + return [n] + expected_array_dims(elem, rank - 1) + else: + return [len(l)] + + +def verify_array_dims(l, dims): + if 
dims[0] != len(l): + raise ValueError + if len(dims) > 1: + for x in l: + verify_array_dims(x, dims[1:]) + + +def read_str_array(f, elem_reader, type_name, rank, bt): + elems = read_str_array_helper(f, elem_reader, type_name, rank) + if type(elems) == tuple: + # Empty array + return np.empty(elems, dtype=bt) + else: + dims = expected_array_dims(elems, rank) + verify_array_dims(elems, dims) + return np.array(elems, dtype=bt) + + +################################################################################ + +READ_BINARY_VERSION = 2 + +# struct format specified at +# https://docs.python.org/2/library/struct.html#format-characters + + +def mk_bin_scalar_reader(t): + def bin_reader(f): + fmt = FUTHARK_PRIMTYPES[t]["bin_format"] + size = FUTHARK_PRIMTYPES[t]["size"] + tf = FUTHARK_PRIMTYPES[t]["numpy_type"] + return tf(struct.unpack("<" + fmt, f.get_chars(size))[0]) + + return bin_reader + + +read_bin_i8 = mk_bin_scalar_reader("i8") +read_bin_i16 = mk_bin_scalar_reader("i16") +read_bin_i32 = mk_bin_scalar_reader("i32") +read_bin_i64 = mk_bin_scalar_reader("i64") + +read_bin_u8 = mk_bin_scalar_reader("u8") +read_bin_u16 = mk_bin_scalar_reader("u16") +read_bin_u32 = mk_bin_scalar_reader("u32") +read_bin_u64 = mk_bin_scalar_reader("u64") + +read_bin_f16 = mk_bin_scalar_reader("f16") +read_bin_f32 = mk_bin_scalar_reader("f32") +read_bin_f64 = mk_bin_scalar_reader("f64") + +read_bin_bool = mk_bin_scalar_reader("bool") + + +def read_is_binary(f): + skip_spaces(f) + c = f.get_char() + if c == b"b": + bin_version = read_bin_u8(f) + if bin_version != READ_BINARY_VERSION: + panic( + 1, + "binary-input: File uses version %i, but I only understand version %i.\n", + bin_version, + READ_BINARY_VERSION, + ) + return True + else: + f.unget_char(c) + return False + + +FUTHARK_PRIMTYPES = { + "i8": { + "binname": b" i8", + "size": 1, + "bin_reader": read_bin_i8, + "str_reader": read_str_i8, + "bin_format": "b", + "numpy_type": np.int8, + }, + "i16": { + "binname": b" i16", + "size": 2, + "bin_reader": read_bin_i16, + "str_reader": read_str_i16, + "bin_format": "h", + "numpy_type": np.int16, + }, + "i32": { + "binname": b" i32", + "size": 4, + "bin_reader": read_bin_i32, + "str_reader": read_str_i32, + "bin_format": "i", + "numpy_type": np.int32, + }, + "i64": { + "binname": b" i64", + "size": 8, + "bin_reader": read_bin_i64, + "str_reader": read_str_i64, + "bin_format": "q", + "numpy_type": np.int64, + }, + "u8": { + "binname": b" u8", + "size": 1, + "bin_reader": read_bin_u8, + "str_reader": read_str_u8, + "bin_format": "B", + "numpy_type": np.uint8, + }, + "u16": { + "binname": b" u16", + "size": 2, + "bin_reader": read_bin_u16, + "str_reader": read_str_u16, + "bin_format": "H", + "numpy_type": np.uint16, + }, + "u32": { + "binname": b" u32", + "size": 4, + "bin_reader": read_bin_u32, + "str_reader": read_str_u32, + "bin_format": "I", + "numpy_type": np.uint32, + }, + "u64": { + "binname": b" u64", + "size": 8, + "bin_reader": read_bin_u64, + "str_reader": read_str_u64, + "bin_format": "Q", + "numpy_type": np.uint64, + }, + "f16": { + "binname": b" f16", + "size": 2, + "bin_reader": read_bin_f16, + "str_reader": read_str_f16, + "bin_format": "e", + "numpy_type": np.float16, + }, + "f32": { + "binname": b" f32", + "size": 4, + "bin_reader": read_bin_f32, + "str_reader": read_str_f32, + "bin_format": "f", + "numpy_type": np.float32, + }, + "f64": { + "binname": b" f64", + "size": 8, + "bin_reader": read_bin_f64, + "str_reader": read_str_f64, + "bin_format": "d", + "numpy_type": np.float64, + }, + "bool": { + 
"binname": b"bool", + "size": 1, + "bin_reader": read_bin_bool, + "str_reader": read_str_bool, + "bin_format": "b", + "numpy_type": bool, + }, +} + + +def read_bin_read_type(f): + read_binname = f.get_chars(4) + + for k, v in FUTHARK_PRIMTYPES.items(): + if v["binname"] == read_binname: + return k + panic(1, "binary-input: Did not recognize the type '%s'.\n", read_binname) + + +def numpy_type_to_type_name(t): + for k, v in FUTHARK_PRIMTYPES.items(): + if v["numpy_type"] == t: + return k + raise Exception(f"Unknown Numpy type: {t}") + + +def read_bin_ensure_scalar(f, expected_type): + dims = read_bin_i8(f) + + if dims != 0: + panic( + 1, + "binary-input: Expected scalar (0 dimensions), but got array with %i dimensions.\n", + dims, + ) + + bin_type = read_bin_read_type(f) + if bin_type != expected_type: + panic( + 1, + "binary-input: Expected scalar of type %s but got scalar of type %s.\n", + expected_type, + bin_type, + ) + + +# ------------------------------------------------------------------------------ +# General interface for reading Primitive Futhark Values +# ------------------------------------------------------------------------------ + + +def read_scalar(f, ty): + if read_is_binary(f): + read_bin_ensure_scalar(f, ty) + return FUTHARK_PRIMTYPES[ty]["bin_reader"](f) + return FUTHARK_PRIMTYPES[ty]["str_reader"](f) + + +def read_array(f, expected_type, rank): + if not read_is_binary(f): + str_reader = FUTHARK_PRIMTYPES[expected_type]["str_reader"] + return read_str_array( + f, + str_reader, + expected_type, + rank, + FUTHARK_PRIMTYPES[expected_type]["numpy_type"], + ) + + bin_rank = read_bin_u8(f) + + if bin_rank != rank: + panic( + 1, + "binary-input: Expected %i dimensions, but got array with %i dimensions.\n", + rank, + bin_rank, + ) + + bin_type_enum = read_bin_read_type(f) + if expected_type != bin_type_enum: + panic( + 1, + "binary-input: Expected %iD-array with element type '%s' but got %iD-array with element type '%s'.\n", + rank, + expected_type, + bin_rank, + bin_type_enum, + ) + + shape = [] + elem_count = 1 + for i in range(rank): + bin_size = read_bin_i64(f) + elem_count *= bin_size + shape.append(bin_size) + + bin_fmt = FUTHARK_PRIMTYPES[bin_type_enum]["bin_format"] + + # We first read the expected number of types into a bytestring, + # then use np.frombuffer. This is because np.fromfile does not + # work on things that are insufficiently file-like, like a network + # stream. + bytes = f.get_chars(elem_count * FUTHARK_PRIMTYPES[expected_type]["size"]) + arr = np.frombuffer(bytes, dtype=FUTHARK_PRIMTYPES[bin_type_enum]["numpy_type"]) + arr.shape = shape + + return arr.copy() # To ensure it is writeable. + + +input_reader = ReaderInput(sys.stdin.buffer) + +import re + + +def read_value(type_desc, reader=input_reader): + """Read a value of the given type. 
The type is a string + representation of the Futhark type.""" + m = re.match(r"((?:\[\])*)([a-z0-9]+)$", type_desc) + if m: + dims = int(len(m.group(1)) / 2) + basetype = m.group(2) + assert m and basetype in FUTHARK_PRIMTYPES, f"Unknown type: {type_desc}" + if dims > 0: + return read_array(reader, basetype, dims) + else: + return read_scalar(reader, basetype) + + +def end_of_input(entry, f=input_reader): + skip_spaces(f) + if f.get_char() != b"": + panic(1, 'Expected EOF on stdin after reading input for "%s".', entry) + + +def write_value_text(v, out=sys.stdout): + if type(v) == np.uint8: + out.write("%uu8" % v) + elif type(v) == np.uint16: + out.write("%uu16" % v) + elif type(v) == np.uint32: + out.write("%uu32" % v) + elif type(v) == np.uint64: + out.write("%uu64" % v) + elif type(v) == np.int8: + out.write("%di8" % v) + elif type(v) == np.int16: + out.write("%di16" % v) + elif type(v) == np.int32: + out.write("%di32" % v) + elif type(v) == np.int64: + out.write("%di64" % v) + elif type(v) in [bool, np.bool_]: + if v: + out.write("true") + else: + out.write("false") + elif type(v) == np.float16: + if np.isnan(v): + out.write("f16.nan") + elif np.isinf(v): + if v >= 0: + out.write("f16.inf") + else: + out.write("-f16.inf") + else: + out.write("%.6ff16" % v) + elif type(v) == np.float32: + if np.isnan(v): + out.write("f32.nan") + elif np.isinf(v): + if v >= 0: + out.write("f32.inf") + else: + out.write("-f32.inf") + else: + out.write("%.6ff32" % v) + elif type(v) == np.float64: + if np.isnan(v): + out.write("f64.nan") + elif np.isinf(v): + if v >= 0: + out.write("f64.inf") + else: + out.write("-f64.inf") + else: + out.write("%.6ff64" % v) + elif type(v) == np.ndarray: + if np.prod(v.shape) == 0: + tname = numpy_type_to_type_name(v.dtype) + out.write("empty({}{})".format("".join([f"[{d}]" for d in v.shape]), tname)) + else: + first = True + out.write("[") + for x in v: + if not first: + out.write(", ") + first = False + write_value(x, out=out) + out.write("]") + else: + raise Exception(f"Cannot print value of type {type(v)}: {v}") + + +type_strs = { + np.dtype("int8"): b" i8", + np.dtype("int16"): b" i16", + np.dtype("int32"): b" i32", + np.dtype("int64"): b" i64", + np.dtype("uint8"): b" u8", + np.dtype("uint16"): b" u16", + np.dtype("uint32"): b" u32", + np.dtype("uint64"): b" u64", + np.dtype("float16"): b" f16", + np.dtype("float32"): b" f32", + np.dtype("float64"): b" f64", + np.dtype("bool"): b"bool", +} + + +def construct_binary_value(v): + t = v.dtype + shape = v.shape + + elems = 1 + for d in shape: + elems *= d + + num_bytes = 1 + 1 + 1 + 4 + len(shape) * 8 + elems * t.itemsize + bytes = bytearray(num_bytes) + bytes[0] = np.int8(ord("b")) + bytes[1] = 2 + bytes[2] = np.int8(len(shape)) + bytes[3:7] = type_strs[t] + + for i in range(len(shape)): + bytes[7 + i * 8 : 7 + (i + 1) * 8] = np.int64(shape[i]).tobytes() + + bytes[7 + len(shape) * 8 :] = np.ascontiguousarray(v).tobytes() + + return bytes + + +def write_value_binary(v, out=sys.stdout): + out = out.buffer + out.write(construct_binary_value(v)) + + +def write_value(v, out=sys.stdout, binary=False): + if binary: + return write_value_binary(v, out=out) + else: + return write_value_text(v, out=out) + + +# End of values.py. +# Start of memory.py. + +import ctypes as ct + + +def allocateMem(size): + return np.empty(size, dtype=np.byte) + + +# Copy an array if its is not-None. This is important for treating +# Numpy arrays as flat memory, but has some overhead. 
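+# (Illustration: a sliced view such as np.zeros(10)[2:5] has a non-None
+# .base that is not the array itself, so normaliseArray below copies it
+# into a fresh contiguous array before it is treated as flat memory; a
+# freshly allocated array passes through unchanged, and unwrapArray then
+# reinterprets its buffer as raw bytes.)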
+def normaliseArray(x):
+    if (x.base is x) or (x.base is None):
+        return x
+    else:
+        return x.copy()
+
+
+def unwrapArray(x):
+    return x.ravel().view(np.byte)
+
+
+def indexArray(x, offset, bt):
+    return x.view(bt)[offset]
+
+
+def writeScalarArray(x, offset, v):
+    x.view(type(v))[offset] = v
+
+
+# An opaque Futhark value.
+class opaque:
+    def __init__(self, desc, *payload):
+        self.data = payload
+        self.desc = desc
+
+    def __repr__(self):
+        return f"<opaque Futhark value of type {self.desc}>"
+
+
+# LMAD stuff
+
+
+def lmad_contiguous_search(checked, expected, strides, shape, used):
+    for i in range(len(strides)):
+        for j in range(len(strides)):
+            if not used[j] and strides[j] == expected and strides[j] >= 0:
+                used[j] = True
+                if checked + 1 == len(strides) or lmad_contiguous_search(
+                    checked + 1, expected * shape[j], strides, shape, used
+                ):
+                    return True
+                used[j] = False
+    return False
+
+
+def lmad_contiguous(strides, shape):
+    used = len(strides) * [False]
+    return lmad_contiguous_search(0, 1, strides, shape, used)
+
+
+def lmad_memcpyable(dst_strides, src_strides, shape):
+    if not lmad_contiguous(dst_strides, shape):
+        return False
+    for i in range(len(dst_strides)):
+        if dst_strides[i] != src_strides[i] and shape[i] != 1:
+            return False
+    return True
+
+
+def lmad_is_tr(strides, shape):
+    r = len(shape)
+    for i in range(1, r):
+        n = 1
+        m = 1
+        ok = True
+        expected = 1
+        # Check strides before 'i'.
+        for j in range(i - 1, -1, -1):
+            ok = ok and strides[j] == expected
+            expected *= shape[j]
+            n *= shape[j]
+        # Check strides after 'i'.
+        for j in range(r - 1, i - 1, -1):
+            ok = ok and strides[j] == expected
+            expected *= shape[j]
+            m *= shape[j]
+        if ok:
+            return (n, m)
+    return None
+
+
+def lmad_map_tr(dst_strides, src_strides, shape):
+    r = len(dst_strides)
+    rowmajor_strides = [0] * r
+    rowmajor_strides[r - 1] = 1
+
+    for i in range(r - 2, -1, -1):
+        rowmajor_strides[i] = rowmajor_strides[i + 1] * shape[i + 1]
+
+    # map_r will be the number of mapped dimensions on top.
+    map_r = 0
+    k = 1
+    for i in range(r):
+        if dst_strides[i] != rowmajor_strides[i] or src_strides[i] != rowmajor_strides[i]:
+            break
+        else:
+            k *= shape[i]
+            map_r += 1
+
+    if rowmajor_strides[map_r:] == dst_strides[map_r:]:
+        r = lmad_is_tr(src_strides[map_r:], shape[map_r:])
+        if r is not None:
+            (n, m) = r
+            return (k, n, m)
+    elif rowmajor_strides[map_r:] == src_strides[map_r:]:
+        r = lmad_is_tr(dst_strides[map_r:], shape[map_r:])
+        if r is not None:
+            (n, m) = r
+            return (k, m, n)  # Sic!
+    return None
+
+
+def lmad_copy_elements(pt, dst, dst_offset, dst_strides, src, src_offset, src_strides, shape):
+    if len(shape) == 1:
+        for i in range(shape[0]):
+            writeScalarArray(
+                dst,
+                dst_offset + i * dst_strides[0],
+                indexArray(src, src_offset + i * src_strides[0], pt),
+            )
+    else:
+        for i in range(shape[0]):
+            lmad_copy_elements(
+                pt,
+                dst,
+                dst_offset + i * dst_strides[0],
+                dst_strides[1:],
+                src,
+                src_offset + i * src_strides[0],
+                src_strides[1:],
+                shape[1:],
+            )
+
+
+def lmad_copy(pt, dst, dst_offset, dst_strides, src, src_offset, src_strides, shape):
+    if lmad_memcpyable(dst_strides, src_strides, shape):
+        dst[
+            dst_offset * ct.sizeof(pt) : dst_offset * ct.sizeof(pt) + np.prod(shape) * ct.sizeof(pt)
+        ] = src[
+            src_offset * ct.sizeof(pt) : src_offset * ct.sizeof(pt) + np.prod(shape) * ct.sizeof(pt)
+        ]
+    else:
+        lmad_copy_elements(
+            pt,
+            dst,
+            dst_offset,
+            dst_strides,
+            src,
+            src_offset,
+            src_strides,
+            shape,
+        )
+
+
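+# Illustrative sketch, not part of the Futhark-generated runtime: how the
+# LMAD helpers above copy between flat byte buffers viewed as strided
+# arrays. All names used here (lmad_copy, allocateMem, np, ct) are defined
+# earlier in this file.
+if __name__ == "__main__":
+    example_src = np.arange(6, dtype=np.int64).view(np.byte)  # bytes of [0, 1, 2, 3, 4, 5]
+    example_dst = allocateMem(6 * 8)  # room for 6 i64 elements
+    # Gather a transposed 2x3 view (strides [1, 2]) into row-major order
+    # (strides [3, 1]); the stride mismatch makes lmad_memcpyable return
+    # False, so the element-wise path is taken.
+    lmad_copy(ct.c_int64, example_dst, 0, [3, 1], example_src, 0, [1, 2], [2, 3])
+    assert list(example_dst.view(np.int64)) == [0, 2, 4, 1, 3, 5]
+
+# End of memory.py.
+# Start of panic.py.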
+ + +def panic(exitcode, fmt, *args): + sys.stderr.write("%s: " % sys.argv[0]) + sys.stderr.write(fmt % args) + sys.stderr.write("\n") + sys.exit(exitcode) + + +# End of panic.py. +# Start of tuning.py + + +def read_tuning_file(kvs, f): + for line in f.read().splitlines(): + size, value = line.split("=") + kvs[size] = int(value) + return kvs + + +# End of tuning.py. +# Start of scalar.py. + +import numpy as np +import math +import struct + + +def intlit(t, x): + if t == np.int8: + return np.int8(x) + elif t == np.int16: + return np.int16(x) + elif t == np.int32: + return np.int32(x) + else: + return np.int64(x) + + +def signed(x): + if type(x) == np.uint8: + return np.int8(x) + elif type(x) == np.uint16: + return np.int16(x) + elif type(x) == np.uint32: + return np.int32(x) + else: + return np.int64(x) + + +def unsigned(x): + if type(x) == np.int8: + return np.uint8(x) + elif type(x) == np.int16: + return np.uint16(x) + elif type(x) == np.int32: + return np.uint32(x) + else: + return np.uint64(x) + + +def shlN(x, y): + return x << y + + +def ashrN(x, y): + return x >> y + + +# Python is so slow that we just make all the unsafe operations safe, +# always. + + +def sdivN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return x // y + + +def sdiv_upN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return (x + y - intlit(type(x), 1)) // y + + +def smodN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return x % y + + +def udivN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return signed(unsigned(x) // unsigned(y)) + + +def udiv_upN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return signed((unsigned(x) + unsigned(y) - unsigned(intlit(type(x), 1))) // unsigned(y)) + + +def umodN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return signed(unsigned(x) % unsigned(y)) + + +def squotN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return np.floor_divide(np.abs(x), np.abs(y)) * np.sign(x) * np.sign(y) + + +def sremN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return np.remainder(np.abs(x), np.abs(y)) * np.sign(x) + + +def sminN(x, y): + return min(x, y) + + +def smaxN(x, y): + return max(x, y) + + +def uminN(x, y): + return signed(min(unsigned(x), unsigned(y))) + + +def umaxN(x, y): + return signed(max(unsigned(x), unsigned(y))) + + +def fminN(x, y): + return np.fmin(x, y) + + +def fmaxN(x, y): + return np.fmax(x, y) + + +def powN(x, y): + return x**y + + +def fpowN(x, y): + return x**y + + +def sleN(x, y): + return x <= y + + +def sltN(x, y): + return x < y + + +def uleN(x, y): + return unsigned(x) <= unsigned(y) + + +def ultN(x, y): + return unsigned(x) < unsigned(y) + + +def lshr8(x, y): + return np.int8(np.uint8(x) >> np.uint8(y)) + + +def lshr16(x, y): + return np.int16(np.uint16(x) >> np.uint16(y)) + + +def lshr32(x, y): + return np.int32(np.uint32(x) >> np.uint32(y)) + + +def lshr64(x, y): + return np.int64(np.uint64(x) >> np.uint64(y)) + + +def sext_T_i8(x): + return np.int8(x) + + +def sext_T_i16(x): + return np.int16(x) + + +def sext_T_i32(x): + return np.int32(x) + + +def sext_T_i64(x): + return np.int64(x) + + +def itob_T_bool(x): + return bool(x) + + +def btoi_bool_i8(x): + return np.int8(x) + + +def btoi_bool_i16(x): + return np.int16(x) + + +def btoi_bool_i32(x): + return np.int32(x) + + +def btoi_bool_i64(x): + return np.int64(x) + + +def ftob_T_bool(x): + return bool(x) + + +def btof_bool_f16(x): + return np.float16(x) + + +def btof_bool_f32(x): + return np.float32(x) + + +def 
btof_bool_f64(x):
+    return np.float64(x)
+
+
+def zext_i8_i8(x):
+    return np.int8(np.uint8(x))
+
+
+def zext_i8_i16(x):
+    return np.int16(np.uint8(x))
+
+
+def zext_i8_i32(x):
+    return np.int32(np.uint8(x))
+
+
+def zext_i8_i64(x):
+    return np.int64(np.uint8(x))
+
+
+def zext_i16_i8(x):
+    return np.int8(np.uint16(x))
+
+
+def zext_i16_i16(x):
+    return np.int16(np.uint16(x))
+
+
+def zext_i16_i32(x):
+    return np.int32(np.uint16(x))
+
+
+def zext_i16_i64(x):
+    return np.int64(np.uint16(x))
+
+
+def zext_i32_i8(x):
+    return np.int8(np.uint32(x))
+
+
+def zext_i32_i16(x):
+    return np.int16(np.uint32(x))
+
+
+def zext_i32_i32(x):
+    return np.int32(np.uint32(x))
+
+
+def zext_i32_i64(x):
+    return np.int64(np.uint32(x))
+
+
+def zext_i64_i8(x):
+    return np.int8(np.uint64(x))
+
+
+def zext_i64_i16(x):
+    return np.int16(np.uint64(x))
+
+
+def zext_i64_i32(x):
+    return np.int32(np.uint64(x))
+
+
+def zext_i64_i64(x):
+    return np.int64(np.uint64(x))
+
+
+sdiv8 = sdiv16 = sdiv32 = sdiv64 = sdivN
+sdiv_up8 = sdiv_up16 = sdiv_up32 = sdiv_up64 = sdiv_upN
+sdiv_safe8 = sdiv_safe16 = sdiv_safe32 = sdiv_safe64 = sdivN
+sdiv_up_safe8 = sdiv_up_safe16 = sdiv_up_safe32 = sdiv_up_safe64 = sdiv_upN
+smod8 = smod16 = smod32 = smod64 = smodN
+smod_safe8 = smod_safe16 = smod_safe32 = smod_safe64 = smodN
+udiv8 = udiv16 = udiv32 = udiv64 = udivN
+udiv_up8 = udiv_up16 = udiv_up32 = udiv_up64 = udiv_upN
+udiv_safe8 = udiv_safe16 = udiv_safe32 = udiv_safe64 = udivN
+udiv_up_safe8 = udiv_up_safe16 = udiv_up_safe32 = udiv_up_safe64 = udiv_upN
+umod8 = umod16 = umod32 = umod64 = umodN
+umod_safe8 = umod_safe16 = umod_safe32 = umod_safe64 = umodN
+squot8 = squot16 = squot32 = squot64 = squotN
+squot_safe8 = squot_safe16 = squot_safe32 = squot_safe64 = squotN
+srem8 = srem16 = srem32 = srem64 = sremN
+srem_safe8 = srem_safe16 = srem_safe32 = srem_safe64 = sremN
+
+shl8 = shl16 = shl32 = shl64 = shlN
+ashr8 = ashr16 = ashr32 = ashr64 = ashrN
+smax8 = smax16 = smax32 = smax64 = smaxN
+smin8 = smin16 = smin32 = smin64 = sminN
+umax8 = umax16 = umax32 = umax64 = umaxN
+umin8 = umin16 = umin32 = umin64 = uminN
+pow8 = pow16 = pow32 = pow64 = powN
+fpow16 = fpow32 = fpow64 = fpowN
+fmax16 = fmax32 = fmax64 = fmaxN
+fmin16 = fmin32 = fmin64 = fminN
+sle8 = sle16 = sle32 = sle64 = sleN
+slt8 = slt16 = slt32 = slt64 = sltN
+ule8 = ule16 = ule32 = ule64 = uleN
+ult8 = ult16 = ult32 = ult64 = ultN
+sext_i8_i8 = sext_i16_i8 = sext_i32_i8 = sext_i64_i8 = sext_T_i8
+sext_i8_i16 = sext_i16_i16 = sext_i32_i16 = sext_i64_i16 = sext_T_i16
+sext_i8_i32 = sext_i16_i32 = sext_i32_i32 = sext_i64_i32 = sext_T_i32
+sext_i8_i64 = sext_i16_i64 = sext_i32_i64 = sext_i64_i64 = sext_T_i64
+itob_i8_bool = itob_i16_bool = itob_i32_bool = itob_i64_bool = itob_T_bool
+ftob_f16_bool = ftob_f32_bool = ftob_f64_bool = ftob_T_bool
+
+
+def clz_T(x):
+    n = np.int32(0)
+    bits = x.itemsize * 8
+    for i in range(bits):
+        if x < 0:
+            break
+        n += np.int32(1)
+        x <<= np.int8(1)
+    return n
+
+
+def ctz_T(x):
+    n = np.int32(0)
+    bits = x.itemsize * 8
+    for i in range(bits):
+        if (x & 1) == 1:
+            break
+        n += np.int32(1)
+        x >>= np.int8(1)
+    return n
+
+
+def popc_T(x):
+    c = np.int32(0)
+    while x != 0:
+        x &= x - np.int8(1)
+        c += np.int32(1)
+    return c
+
+
+futhark_popc8 = futhark_popc16 = futhark_popc32 = futhark_popc64 = popc_T
+futhark_clzz8 = futhark_clzz16 = futhark_clzz32 = futhark_clzz64 = clz_T
+futhark_ctzz8 = futhark_ctzz16 = futhark_ctzz32 = futhark_ctzz64 = ctz_T
+
+
+def ssignum(x):
+    return np.sign(x)
+
+
+def usignum(x):
+    if x < 0:
+        return ssignum(-x)
+ else: + return ssignum(x) + + +def sitofp_T_f32(x): + return np.float32(x) + + +sitofp_i8_f32 = sitofp_i16_f32 = sitofp_i32_f32 = sitofp_i64_f32 = sitofp_T_f32 + + +def sitofp_T_f64(x): + return np.float64(x) + + +sitofp_i8_f64 = sitofp_i16_f64 = sitofp_i32_f64 = sitofp_i64_f64 = sitofp_T_f64 + + +def uitofp_T_f32(x): + return np.float32(unsigned(x)) + + +uitofp_i8_f32 = uitofp_i16_f32 = uitofp_i32_f32 = uitofp_i64_f32 = uitofp_T_f32 + + +def uitofp_T_f64(x): + return np.float64(unsigned(x)) + + +uitofp_i8_f64 = uitofp_i16_f64 = uitofp_i32_f64 = uitofp_i64_f64 = uitofp_T_f64 + + +def fptosi_T_i8(x): + if np.isnan(x) or np.isinf(x): + return np.int8(0) + else: + return np.int8(np.trunc(x)) + + +fptosi_f16_i8 = fptosi_f32_i8 = fptosi_f64_i8 = fptosi_T_i8 + + +def fptosi_T_i16(x): + if np.isnan(x) or np.isinf(x): + return np.int16(0) + else: + return np.int16(np.trunc(x)) + + +fptosi_f16_i16 = fptosi_f32_i16 = fptosi_f64_i16 = fptosi_T_i16 + + +def fptosi_T_i32(x): + if np.isnan(x) or np.isinf(x): + return np.int32(0) + else: + return np.int32(np.trunc(x)) + + +fptosi_f16_i32 = fptosi_f32_i32 = fptosi_f64_i32 = fptosi_T_i32 + + +def fptosi_T_i64(x): + if np.isnan(x) or np.isinf(x): + return np.int64(0) + else: + return np.int64(np.trunc(x)) + + +fptosi_f16_i64 = fptosi_f32_i64 = fptosi_f64_i64 = fptosi_T_i64 + + +def fptoui_T_i8(x): + if np.isnan(x) or np.isinf(x): + return np.int8(0) + else: + return np.int8(np.trunc(x)) + + +fptoui_f16_i8 = fptoui_f32_i8 = fptoui_f64_i8 = fptoui_T_i8 + + +def fptoui_T_i16(x): + if np.isnan(x) or np.isinf(x): + return np.int16(0) + else: + return np.int16(np.trunc(x)) + + +fptoui_f16_i16 = fptoui_f32_i16 = fptoui_f64_i16 = fptoui_T_i16 + + +def fptoui_T_i32(x): + if np.isnan(x) or np.isinf(x): + return np.int32(0) + else: + return np.int32(np.trunc(x)) + + +fptoui_f16_i32 = fptoui_f32_i32 = fptoui_f64_i32 = fptoui_T_i32 + + +def fptoui_T_i64(x): + if np.isnan(x) or np.isinf(x): + return np.int64(0) + else: + return np.int64(np.trunc(x)) + + +fptoui_f16_i64 = fptoui_f32_i64 = fptoui_f64_i64 = fptoui_T_i64 + + +def fpconv_f16_f32(x): + return np.float32(x) + + +def fpconv_f16_f64(x): + return np.float64(x) + + +def fpconv_f32_f16(x): + return np.float16(x) + + +def fpconv_f32_f64(x): + return np.float64(x) + + +def fpconv_f64_f16(x): + return np.float16(x) + + +def fpconv_f64_f32(x): + return np.float32(x) + + +def futhark_umul_hi8(a, b): + return np.int8((np.uint64(np.uint8(a)) * np.uint64(np.uint8(b))) >> np.uint64(8)) + + +def futhark_umul_hi16(a, b): + return np.int16((np.uint64(np.uint16(a)) * np.uint64(np.uint16(b))) >> np.uint64(16)) + + +def futhark_umul_hi32(a, b): + return np.int32((np.uint64(np.uint32(a)) * np.uint64(np.uint32(b))) >> np.uint64(32)) + + +def futhark_umul_hi64(a, b): + return np.int64(np.uint64(int(np.uint64(a)) * int(np.uint64(b)) >> 64)) + + +def futhark_smul_hi8(a, b): + return np.int8((np.int64(a) * np.int64(b)) >> np.int64(8)) + + +def futhark_smul_hi16(a, b): + return np.int16((np.int64(a) * np.int64(b)) >> np.int64(16)) + + +def futhark_smul_hi32(a, b): + return np.int32((np.int64(a) * np.int64(b)) >> np.int64(32)) + + +def futhark_smul_hi64(a, b): + return np.int64(int(a) * int(b) >> 64) + + +def futhark_umad_hi8(a, b, c): + return futhark_umul_hi8(a, b) + c + + +def futhark_umad_hi16(a, b, c): + return futhark_umul_hi16(a, b) + c + + +def futhark_umad_hi32(a, b, c): + return futhark_umul_hi32(a, b) + c + + +def futhark_umad_hi64(a, b, c): + return futhark_umul_hi64(a, b) + c + + +def futhark_smad_hi8(a, b, c): + return 
futhark_smul_hi8(a, b) + c + + +def futhark_smad_hi16(a, b, c): + return futhark_smul_hi16(a, b) + c + + +def futhark_smad_hi32(a, b, c): + return futhark_smul_hi32(a, b) + c + + +def futhark_smad_hi64(a, b, c): + return futhark_smul_hi64(a, b) + c + + +def futhark_log64(x): + return np.float64(np.log(x)) + + +def futhark_log2_64(x): + return np.float64(np.log2(x)) + + +def futhark_log10_64(x): + return np.float64(np.log10(x)) + + +def futhark_log1p_64(x): + return np.float64(np.log1p(x)) + + +def futhark_sqrt64(x): + return np.sqrt(x) + + +def futhark_cbrt64(x): + return np.cbrt(x) + + +def futhark_exp64(x): + return np.exp(x) + + +def futhark_cos64(x): + return np.cos(x) + + +def futhark_sin64(x): + return np.sin(x) + + +def futhark_tan64(x): + return np.tan(x) + + +def futhark_acos64(x): + return np.arccos(x) + + +def futhark_asin64(x): + return np.arcsin(x) + + +def futhark_atan64(x): + return np.arctan(x) + + +def futhark_cosh64(x): + return np.cosh(x) + + +def futhark_sinh64(x): + return np.sinh(x) + + +def futhark_tanh64(x): + return np.tanh(x) + + +def futhark_acosh64(x): + return np.arccosh(x) + + +def futhark_asinh64(x): + return np.arcsinh(x) + + +def futhark_atanh64(x): + return np.arctanh(x) + + +def futhark_atan2_64(x, y): + return np.arctan2(x, y) + + +def futhark_hypot64(x, y): + return np.hypot(x, y) + + +def futhark_gamma64(x): + return np.float64(math.gamma(x)) + + +def futhark_lgamma64(x): + return np.float64(math.lgamma(x)) + + +def futhark_erf64(x): + return np.float64(math.erf(x)) + + +def futhark_erfc64(x): + return np.float64(math.erfc(x)) + + +def futhark_round64(x): + return np.round(x) + + +def futhark_ceil64(x): + return np.ceil(x) + + +def futhark_floor64(x): + return np.floor(x) + + +def futhark_nextafter64(x, y): + return np.nextafter(x, y) + + +def futhark_isnan64(x): + return np.isnan(x) + + +def futhark_isinf64(x): + return np.isinf(x) + + +def futhark_to_bits64(x): + s = struct.pack(">d", x) + return np.int64(struct.unpack(">q", s)[0]) + + +def futhark_from_bits64(x): + s = struct.pack(">q", x) + return np.float64(struct.unpack(">d", s)[0]) + + +def futhark_log32(x): + return np.float32(np.log(x)) + + +def futhark_log2_32(x): + return np.float32(np.log2(x)) + + +def futhark_log10_32(x): + return np.float32(np.log10(x)) + + +def futhark_log1p_32(x): + return np.float32(np.log1p(x)) + + +def futhark_sqrt32(x): + return np.float32(np.sqrt(x)) + + +def futhark_cbrt32(x): + return np.float32(np.cbrt(x)) + + +def futhark_exp32(x): + return np.exp(x) + + +def futhark_cos32(x): + return np.cos(x) + + +def futhark_sin32(x): + return np.sin(x) + + +def futhark_tan32(x): + return np.tan(x) + + +def futhark_acos32(x): + return np.arccos(x) + + +def futhark_asin32(x): + return np.arcsin(x) + + +def futhark_atan32(x): + return np.arctan(x) + + +def futhark_cosh32(x): + return np.cosh(x) + + +def futhark_sinh32(x): + return np.sinh(x) + + +def futhark_tanh32(x): + return np.tanh(x) + + +def futhark_acosh32(x): + return np.arccosh(x) + + +def futhark_asinh32(x): + return np.arcsinh(x) + + +def futhark_atanh32(x): + return np.arctanh(x) + + +def futhark_atan2_32(x, y): + return np.arctan2(x, y) + + +def futhark_hypot32(x, y): + return np.hypot(x, y) + + +def futhark_gamma32(x): + return np.float32(math.gamma(x)) + + +def futhark_lgamma32(x): + return np.float32(math.lgamma(x)) + + +def futhark_erf32(x): + return np.float32(math.erf(x)) + + +def futhark_erfc32(x): + return np.float32(math.erfc(x)) + + +def futhark_round32(x): + return np.round(x) + + +def 
futhark_ceil32(x): + return np.ceil(x) + + +def futhark_floor32(x): + return np.floor(x) + + +def futhark_nextafter32(x, y): + return np.nextafter(x, y) + + +def futhark_isnan32(x): + return np.isnan(x) + + +def futhark_isinf32(x): + return np.isinf(x) + + +def futhark_to_bits32(x): + s = struct.pack(">f", x) + return np.int32(struct.unpack(">l", s)[0]) + + +def futhark_from_bits32(x): + s = struct.pack(">l", x) + return np.float32(struct.unpack(">f", s)[0]) + + +def futhark_log16(x): + return np.float16(np.log(x)) + + +def futhark_log2_16(x): + return np.float16(np.log2(x)) + + +def futhark_log10_16(x): + return np.float16(np.log10(x)) + + +def futhark_log1p_16(x): + return np.float16(np.log1p(x)) + + +def futhark_sqrt16(x): + return np.float16(np.sqrt(x)) + + +def futhark_cbrt16(x): + return np.float16(np.cbrt(x)) + + +def futhark_exp16(x): + return np.exp(x) + + +def futhark_cos16(x): + return np.cos(x) + + +def futhark_sin16(x): + return np.sin(x) + + +def futhark_tan16(x): + return np.tan(x) + + +def futhark_acos16(x): + return np.arccos(x) + + +def futhark_asin16(x): + return np.arcsin(x) + + +def futhark_atan16(x): + return np.arctan(x) + + +def futhark_cosh16(x): + return np.cosh(x) + + +def futhark_sinh16(x): + return np.sinh(x) + + +def futhark_tanh16(x): + return np.tanh(x) + + +def futhark_acosh16(x): + return np.arccosh(x) + + +def futhark_asinh16(x): + return np.arcsinh(x) + + +def futhark_atanh16(x): + return np.arctanh(x) + + +def futhark_atan2_16(x, y): + return np.arctan2(x, y) + + +def futhark_hypot16(x, y): + return np.hypot(x, y) + + +def futhark_gamma16(x): + return np.float16(math.gamma(x)) + + +def futhark_lgamma16(x): + return np.float16(math.lgamma(x)) + + +def futhark_erf16(x): + return np.float16(math.erf(x)) + + +def futhark_erfc16(x): + return np.float16(math.erfc(x)) + + +def futhark_round16(x): + return np.round(x) + + +def futhark_ceil16(x): + return np.ceil(x) + + +def futhark_floor16(x): + return np.floor(x) + + +def futhark_nextafter16(x, y): + return np.nextafter(x, y) + + +def futhark_isnan16(x): + return np.isnan(x) + + +def futhark_isinf16(x): + return np.isinf(x) + + +def futhark_to_bits16(x): + s = struct.pack(">e", x) + return np.int16(struct.unpack(">H", s)[0]) + + +def futhark_from_bits16(x): + s = struct.pack(">H", np.uint16(x)) + return np.float16(struct.unpack(">e", s)[0]) + + +def futhark_lerp16(v0, v1, t): + return v0 + (v1 - v0) * t + + +def futhark_lerp32(v0, v1, t): + return v0 + (v1 - v0) * t + + +def futhark_lerp64(v0, v1, t): + return v0 + (v1 - v0) * t + + +def futhark_ldexp16(x, y): + return np.ldexp(x, y) + + +def futhark_ldexp32(x, y): + return np.ldexp(x, y) + + +def futhark_ldexp64(x, y): + return np.ldexp(x, y) + + +def futhark_mad16(a, b, c): + return a * b + c + + +def futhark_mad32(a, b, c): + return a * b + c + + +def futhark_mad64(a, b, c): + return a * b + c + + +def futhark_fma16(a, b, c): + return a * b + c + + +def futhark_fma32(a, b, c): + return a * b + c + + +def futhark_fma64(a, b, c): + return a * b + c + + +futhark_copysign16 = futhark_copysign32 = futhark_copysign64 = np.copysign + +# End of scalar.py. 
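+# Illustrative sketch, not part of the Futhark-generated runtime: the
+# scalar wrappers in scalar.py above make Futhark's "unsafe" integer ops
+# total (a zero divisor yields 0 rather than raising), emulate unsigned
+# comparisons on numpy's signed types, and implement bit-level casts via
+# struct. Only names defined earlier in this file are used here.
+if __name__ == "__main__":
+    assert sdiv32(np.int32(7), np.int32(2)) == np.int32(3)  # floor division
+    assert sdiv32(np.int32(7), np.int32(0)) == np.int32(0)  # total: no ZeroDivisionError
+    assert umax8(np.int8(-1), np.int8(1)) == np.int8(-1)  # -1 compares as 255 unsigned
+    example_bits = futhark_to_bits32(np.float32(1.0))  # IEEE 754 bit pattern of 1.0f
+    assert example_bits == np.int32(0x3F800000)
+    assert futhark_from_bits32(example_bits) == np.float32(1.0)  # lossless round trip
+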
+# Start of server.py + +import sys +import time +import shlex # For string splitting + + +class Server: + def __init__(self, ctx): + self._ctx = ctx + self._vars = {} + + class Failure(BaseException): + def __init__(self, msg): + self.msg = msg + + def _get_arg(self, args, i): + if i < len(args): + return args[i] + else: + raise self.Failure("Insufficient command args") + + def _get_entry_point(self, entry): + if entry in self._ctx.entry_points: + return self._ctx.entry_points[entry] + else: + raise self.Failure("Unknown entry point: %s" % entry) + + def _check_var(self, vname): + if not vname in self._vars: + raise self.Failure("Unknown variable: %s" % vname) + + def _check_new_var(self, vname): + if vname in self._vars: + raise self.Failure("Variable already exists: %s" % vname) + + def _get_var(self, vname): + self._check_var(vname) + return self._vars[vname] + + def _cmd_inputs(self, args): + entry = self._get_arg(args, 0) + for t in self._get_entry_point(entry)[1]: + print(t) + + def _cmd_outputs(self, args): + entry = self._get_arg(args, 0) + for t in self._get_entry_point(entry)[2]: + print(t) + + def _cmd_dummy(self, args): + pass + + def _cmd_free(self, args): + for vname in args: + self._check_var(vname) + del self._vars[vname] + + def _cmd_rename(self, args): + oldname = self._get_arg(args, 0) + newname = self._get_arg(args, 1) + self._check_var(oldname) + self._check_new_var(newname) + self._vars[newname] = self._vars[oldname] + del self._vars[oldname] + + def _cmd_call(self, args): + entry = self._get_entry_point(self._get_arg(args, 0)) + entry_fname = entry[0] + num_ins = len(entry[1]) + num_outs = len(entry[2]) + exp_len = 1 + num_outs + num_ins + + if len(args) != exp_len: + raise self.Failure("Invalid argument count, expected %d" % exp_len) + + out_vnames = args[1 : num_outs + 1] + + for out_vname in out_vnames: + self._check_new_var(out_vname) + + in_vnames = args[1 + num_outs :] + ins = [self._get_var(in_vname) for in_vname in in_vnames] + + try: + (runtime, vals) = getattr(self._ctx, entry_fname)(*ins) + except Exception as e: + raise self.Failure(str(e)) + + print("runtime: %d" % runtime) + + if num_outs == 1: + self._vars[out_vnames[0]] = vals + else: + for out_vname, val in zip(out_vnames, vals): + self._vars[out_vname] = val + + def _store_val(self, f, value): + # In case we are using the PyOpenCL backend, we first + # need to convert OpenCL arrays to ordinary NumPy + # arrays. We do this in a nasty way. + if isinstance(value, opaque): + for component in value.data: + self._store_val(f, component) + elif ( + isinstance(value, np.number) + or isinstance(value, bool) + or isinstance(value, np.bool_) + or isinstance(value, np.ndarray) + ): + # Ordinary NumPy value. + f.write(construct_binary_value(value)) + else: + # Assuming PyOpenCL array. 
+ f.write(construct_binary_value(value.get())) + + def _cmd_store(self, args): + fname = self._get_arg(args, 0) + + with open(fname, "wb") as f: + for i in range(1, len(args)): + self._store_val(f, self._get_var(args[i])) + + def _restore_val(self, reader, typename): + if typename in self._ctx.opaques: + vs = [] + for t in self._ctx.opaques[typename]: + vs += [read_value(t, reader)] + return opaque(typename, *vs) + else: + return read_value(typename, reader) + + def _cmd_restore(self, args): + if len(args) % 2 == 0: + raise self.Failure("Invalid argument count") + + fname = args[0] + args = args[1:] + + with open(fname, "rb") as f: + reader = ReaderInput(f) + while args != []: + vname = args[0] + typename = args[1] + args = args[2:] + + if vname in self._vars: + raise self.Failure("Variable already exists: %s" % vname) + + try: + self._vars[vname] = self._restore_val(reader, typename) + except ValueError: + raise self.Failure( + "Failed to restore variable %s.\n" + "Possibly malformed data in %s.\n" % (vname, fname) + ) + + skip_spaces(reader) + if reader.get_char() != b"": + raise self.Failure("Expected EOF after reading values") + + def _cmd_types(self, args): + for k in self._ctx.opaques.keys(): + print(k) + + def _cmd_entry_points(self, args): + for k in self._ctx.entry_points.keys(): + print(k) + + _commands = { + "inputs": _cmd_inputs, + "outputs": _cmd_outputs, + "call": _cmd_call, + "restore": _cmd_restore, + "store": _cmd_store, + "free": _cmd_free, + "rename": _cmd_rename, + "clear": _cmd_dummy, + "pause_profiling": _cmd_dummy, + "unpause_profiling": _cmd_dummy, + "report": _cmd_dummy, + "types": _cmd_types, + "entry_points": _cmd_entry_points, + } + + def _process_line(self, line): + lex = shlex.shlex(line) + lex.quotes = '"' + lex.whitespace_split = True + lex.commenters = "" + words = list(lex) + if words == []: + raise self.Failure("Empty line") + else: + cmd = words[0] + args = words[1:] + if cmd in self._commands: + self._commands[cmd](self, args) + else: + raise self.Failure("Unknown command: %s" % cmd) + + def run(self): + while True: + print("%%% OK", flush=True) + line = sys.stdin.readline() + if line == "": + return + try: + self._process_line(line) + except self.Failure as e: + print("%%% FAILURE") + print(e.msg) + + +# End of server.py +class entropy: + entry_points = { + "byte_histogram": ("byte_histogram", ["[]u8"], ["[]i64"]), + "chunked_entropy": ("chunked_entropy", ["i64", "[]u8"], ["[]u8"]), + "entropy": ("entropy", ["[]u8"], ["f32"]), + } + opaques = {} + + def __init__( + self, + build_options=build_options, + command_queue=None, + interactive=False, + platform_pref=preferred_platform, + device_pref=preferred_device, + default_group_size=default_group_size, + default_num_groups=default_num_groups, + default_tile_size=default_tile_size, + default_reg_tile_size=default_reg_tile_size, + default_threshold=default_threshold, + sizes=sizes, + ): + size_heuristics = [ + ("NVIDIA CUDA", cl.device_type.GPU, "lockstep_width", lambda device: np.int32(32)), + ( + "AMD Accelerated Parallel Processing", + cl.device_type.GPU, + "lockstep_width", + lambda device: np.int32(32), + ), + ("", cl.device_type.GPU, "lockstep_width", lambda device: np.int32(1)), + ( + "", + cl.device_type.GPU, + "num_groups", + lambda device: ( + np.int32(4) * device.get_info(getattr(cl.device_info, "MAX_COMPUTE_UNITS")) + ), + ), + ("", cl.device_type.GPU, "group_size", lambda device: np.int32(256)), + ("", cl.device_type.GPU, "tile_size", lambda device: np.int32(16)), + ("", cl.device_type.GPU, 
"reg_tile_size", lambda device: np.int32(4)), + ("", cl.device_type.GPU, "threshold", lambda device: np.int32(32768)), + ("", cl.device_type.CPU, "lockstep_width", lambda device: np.int32(1)), + ( + "", + cl.device_type.CPU, + "num_groups", + lambda device: device.get_info(getattr(cl.device_info, "MAX_COMPUTE_UNITS")), + ), + ("", cl.device_type.CPU, "group_size", lambda device: np.int32(32)), + ("", cl.device_type.CPU, "tile_size", lambda device: np.int32(4)), + ("", cl.device_type.CPU, "reg_tile_size", lambda device: np.int32(1)), + ( + "", + cl.device_type.CPU, + "threshold", + lambda device: device.get_info(getattr(cl.device_info, "MAX_COMPUTE_UNITS")), + ), + ] + self.global_failure_args_max = 3 + self.failure_msgs = [ + "Index [{}:{}] out of bounds for array of shape [{}].\n-> #0 ofrak_gpu/entropy.fut:13:44-83\n #1 /prelude/functional.fut:9:44-45\n #2 ofrak_gpu/entropy.fut:13:8-93\n #3 ofrak_gpu/entropy.fut:11:1-13:93\n", + "Index [{}:{}] out of bounds for array of shape [{}].\n-> #0 ofrak_gpu/entropy.fut:13:44-83\n #1 /prelude/functional.fut:9:44-45\n #2 ofrak_gpu/entropy.fut:13:8-93\n #3 ofrak_gpu/entropy.fut:11:1-13:93\n", + ] + constants = [ + ( + "entropyzisegred_nonseg_6344_dim1", + lambda: self.sizes["entropy.segred_tblock_size_6336"], + ), + ( + "entropyzisegred_nonseg_6344zisegred_tblock_sizze_6337", + lambda: self.sizes["entropy.segred_tblock_size_6336"], + ), + ("entropyzisegred_nonseg_6344zichunk_sizze_6996", lambda: np.int64(1)), + ( + "entropyzisegred_large_6901_dim1", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ( + "entropyzisegred_large_6901ziseghist_tblock_sizze_6321", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ("entropyzisegred_large_6901zichunk_sizze_6902", lambda: np.int64(1)), + ( + "entropyzisegred_small_6901_dim1", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ( + "entropyzisegred_small_6901ziseghist_tblock_sizze_6321", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ( + "entropyziseghist_global_6328_dim1", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ( + "entropyziseghist_global_6328ziseghist_tblock_sizze_6321", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ("entropyziseghist_local_6328_dim1", lambda: self.max_thread_block_size), + ( + "entropyziseghist_local_6328zimax_tblock_sizze_6825", + lambda: self.max_thread_block_size, + ), + ( + "chunked_entropyzisegmap_6687_dim1", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6456"], + ), + ( + "chunked_entropyzisegmap_6687zisegmap_tblock_sizze_6683", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6456"], + ), + ( + "chunked_entropyzisegred_large_6669_dim1", + lambda: self.sizes["chunked_entropy.segred_tblock_size_6473"], + ), + ( + "chunked_entropyzisegred_large_6669zisegred_tblock_sizze_6663", + lambda: self.sizes["chunked_entropy.segred_tblock_size_6473"], + ), + ("chunked_entropyzisegred_large_6669zichunk_sizze_6833", lambda: np.int64(1)), + ( + "chunked_entropyzisegred_small_6669_dim1", + lambda: self.sizes["chunked_entropy.segred_tblock_size_6473"], + ), + ( + "chunked_entropyzisegred_small_6669zisegred_tblock_sizze_6663", + lambda: self.sizes["chunked_entropy.segred_tblock_size_6473"], + ), + ( + "chunked_entropyzisegmap_6645_dim1", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6523"], + ), + ( + "chunked_entropyzisegmap_6645zisegmap_tblock_sizze_6639", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6523"], + ), + ( + "chunked_entropyzisegmap_6606_dim1", 
+ lambda: self.sizes["chunked_entropy.segmap_tblock_size_6577"], + ), + ( + "chunked_entropyzisegmap_6606zisegmap_tblock_sizze_6600", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6577"], + ), + ( + "chunked_entropyzisegmap_6405_dim1", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6350"], + ), + ( + "chunked_entropyzisegmap_6405zisegmap_tblock_sizze_6401", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6350"], + ), + ( + "byte_histogramzisegred_large_6901_dim1", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ( + "byte_histogramzisegred_large_6901ziseghist_tblock_sizze_6305", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ("byte_histogramzisegred_large_6901zichunk_sizze_6902", lambda: np.int64(1)), + ( + "byte_histogramzisegred_small_6901_dim1", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ( + "byte_histogramzisegred_small_6901ziseghist_tblock_sizze_6305", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ( + "byte_histogramziseghist_global_6312_dim1", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ( + "byte_histogramziseghist_global_6312ziseghist_tblock_sizze_6305", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ("byte_histogramziseghist_local_6312_dim1", lambda: self.max_thread_block_size), + ( + "byte_histogramziseghist_local_6312zimax_tblock_sizze_6825", + lambda: self.max_thread_block_size, + ), + ] + program = initialise_opencl_object( + self, + program_src=fut_opencl_src, + build_options=build_options, + command_queue=command_queue, + interactive=interactive, + platform_pref=platform_pref, + device_pref=device_pref, + default_group_size=default_group_size, + default_num_groups=default_num_groups, + default_tile_size=default_tile_size, + default_reg_tile_size=default_reg_tile_size, + default_threshold=default_threshold, + size_heuristics=size_heuristics, + required_types=["i8", "i32", "i64", "f32", "bool", "unit"], + user_sizes=sizes, + all_sizes={ + "builtin#replicate_i32.tblock_size_6879": { + "class": "thread_block_size", + "value": None, + }, + "builtin#replicate_i64.tblock_size_6803": { + "class": "thread_block_size", + "value": None, + }, + "byte_histogram.seghist_num_tblocks_6306": {"class": "grid_size", "value": None}, + "byte_histogram.seghist_tblock_size_6304": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segmap_num_tblocks_6525": {"class": "grid_size", "value": None}, + "chunked_entropy.segmap_tblock_size_6350": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segmap_tblock_size_6456": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segmap_tblock_size_6523": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segmap_tblock_size_6577": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segred_num_tblocks_6475": {"class": "grid_size", "value": None}, + "chunked_entropy.segred_tblock_size_6473": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.suff_outer_par_0": {"class": "threshold(def, )", "value": None}, + "entropy.seghist_num_tblocks_6322": {"class": "grid_size", "value": None}, + "entropy.seghist_tblock_size_6320": {"class": "thread_block_size", "value": None}, + "entropy.segred_num_tblocks_6338": {"class": "grid_size", "value": None}, + "entropy.segred_tblock_size_6336": {"class": "thread_block_size", "value": None}, + }, + 
            constants=constants,
+        )
+        self.builtinzhreplicate_i32zireplicate_6875_var = (
+            program.builtinzhreplicate_i32zireplicate_6875
+        )
+        self.builtinzhreplicate_i64zireplicate_6799_var = (
+            program.builtinzhreplicate_i64zireplicate_6799
+        )
+        self.byte_histogramziseghist_global_6312_var = program.byte_histogramziseghist_global_6312
+        self.byte_histogramziseghist_local_6312_var = program.byte_histogramziseghist_local_6312
+        self.byte_histogramzisegred_large_6901_var = program.byte_histogramzisegred_large_6901
+        self.byte_histogramzisegred_small_6901_var = program.byte_histogramzisegred_small_6901
+        self.chunked_entropyzisegmap_6405_var = program.chunked_entropyzisegmap_6405
+        self.chunked_entropyzisegmap_6606_var = program.chunked_entropyzisegmap_6606
+        self.chunked_entropyzisegmap_6645_var = program.chunked_entropyzisegmap_6645
+        self.chunked_entropyzisegmap_6687_var = program.chunked_entropyzisegmap_6687
+        self.chunked_entropyzisegred_large_6669_var = program.chunked_entropyzisegred_large_6669
+        self.chunked_entropyzisegred_small_6669_var = program.chunked_entropyzisegred_small_6669
+        self.entropyziseghist_global_6328_var = program.entropyziseghist_global_6328
+        self.entropyziseghist_local_6328_var = program.entropyziseghist_local_6328
+        self.entropyzisegred_large_6901_var = program.entropyzisegred_large_6901
+        self.entropyzisegred_nonseg_6344_var = program.entropyzisegred_nonseg_6344
+        self.entropyzisegred_small_6901_var = program.entropyzisegred_small_6901
+        self.constants = {}
+        self.constants["counters_mem_6938"] = opencl_alloc(
+            self, np.int64(81920), 'self.constants["counters_mem_6938"]'
+        )
+        self.futhark_builtinzhreplicate_i32(
+            self.constants["counters_mem_6938"], np.int64(20480), np.int32(0)
+        )
+        self.constants["counters_mem_6997"] = opencl_alloc(
+            self, np.int64(80), 'self.constants["counters_mem_6997"]'
+        )
+        self.futhark_builtinzhreplicate_i32(
+            self.constants["counters_mem_6997"], np.int64(20), np.int32(0)
+        )
+        self.constants["counters_mem_6868"] = opencl_alloc(
+            self, np.int64(81920), 'self.constants["counters_mem_6868"]'
+        )
+        self.futhark_builtinzhreplicate_i32(
+            self.constants["counters_mem_6868"], np.int64(20480), np.int32(0)
+        )
+
+    def futhark_builtinzhreplicate_i32(self, mem_6870, num_elems_6871, val_6872):
+        replicate_n_6874 = num_elems_6871
+        tblock_sizze_6879 = self.sizes["builtin#replicate_i32.tblock_size_6879"]
+        virt_num_tblocks_6880 = sdiv_up64(replicate_n_6874, tblock_sizze_6879)
+        num_tblocks_6881 = smin64(virt_num_tblocks_6880, np.int64(1048576))
+        if (1 * (np.int64(num_tblocks_6881) * np.int64(tblock_sizze_6879))) != 0:
+            self.builtinzhreplicate_i32zireplicate_6875_var.set_args(
+                cl.LocalMemory(max(np.int64(0), 1)),
+                ct.c_int64(num_elems_6871),
+                ct.c_int32(val_6872),
+                ct.c_int64(replicate_n_6874),
+                ct.c_int64(virt_num_tblocks_6880),
+                ct.c_int64(num_tblocks_6881),
+                mem_6870,
+            )
+            cl.enqueue_nd_range_kernel(
+                self.queue,
+                self.builtinzhreplicate_i32zireplicate_6875_var,
+                ((np.int64(num_tblocks_6881) * np.int64(tblock_sizze_6879)),),
+                (np.int64(tblock_sizze_6879),),
+            )
+            if synchronous:
+                sync(self)
+        return ()
+
+    def futhark_builtinzhreplicate_i64(self, mem_6794, num_elems_6795, val_6796):
+        replicate_n_6798 = num_elems_6795
+        tblock_sizze_6803 = self.sizes["builtin#replicate_i64.tblock_size_6803"]
virt_num_tblocks_6804 = sdiv_up64(replicate_n_6798, tblock_sizze_6803) + num_tblocks_6805 = smin64(virt_num_tblocks_6804, np.int64(1048576)) + if (1 * (np.int64(num_tblocks_6805) * np.int64(tblock_sizze_6803))) != 0: + self.builtinzhreplicate_i64zireplicate_6799_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + ct.c_int64(num_elems_6795), + ct.c_int64(val_6796), + ct.c_int64(replicate_n_6798), + ct.c_int64(virt_num_tblocks_6804), + ct.c_int64(num_tblocks_6805), + mem_6794, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.builtinzhreplicate_i64zireplicate_6799_var, + ((np.int64(num_tblocks_6805) * np.int64(tblock_sizze_6803)),), + (np.int64(tblock_sizze_6803),), + ) + if synchronous: + sync(self) + return () + + def futhark_entry_byte_histogram(self, xs_mem_6757, n_5765): + mem_6758 = opencl_alloc(self, np.int64(2048), "mem_6758") + self.futhark_builtinzhreplicate_i64(mem_6758, np.int64(256), np.int64(0)) + seghist_tblock_sizze_6305 = self.sizes["byte_histogram.seghist_tblock_size_6304"] + max_num_tblocks_6814 = self.sizes["byte_histogram.seghist_num_tblocks_6306"] + num_tblocks_6307 = sext_i64_i32( + smax64( + np.int64(1), + smin64(sdiv_up64(n_5765, seghist_tblock_sizze_6305), max_num_tblocks_6814), + ) + ) + h_6818 = np.int64(2048) + seg_h_6819 = np.int64(2048) + if seg_h_6819 == np.int64(0): + pass + else: + hist_H_6820 = np.int64(256) + hist_el_sizze_6821 = sdiv_up64(h_6818, hist_H_6820) + hist_N_6822 = n_5765 + hist_RF_6823 = np.int32(1) + hist_L_6824 = self.max_shared_memory + max_tblock_sizze_6825 = self.max_thread_block_size + num_tblocks_6826 = sdiv_up64( + sext_i32_i64(sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305)), + max_tblock_sizze_6825, + ) + hist_m_prime_6827 = sitofp_i64_f64( + smin64( + squot64(hist_L_6824, hist_el_sizze_6821), + sdiv_up64(hist_N_6822, num_tblocks_6826), + ) + ) / sitofp_i64_f64(hist_H_6820) + hist_M0_6828 = smax64( + np.int64(1), smin64(fptosi_f64_i64(hist_m_prime_6827), max_tblock_sizze_6825) + ) + hist_Nout_6829 = np.int64(1) + hist_Nin_6830 = n_5765 + work_asymp_M_max_6831 = squot64( + (hist_Nout_6829 * hist_N_6822), ((np.int64(2) * num_tblocks_6826) * hist_H_6820) + ) + hist_M_6832 = sext_i64_i32(smin64(hist_M0_6828, work_asymp_M_max_6831)) + hist_C_6833 = sdiv_up64( + max_tblock_sizze_6825, sext_i32_i64(smax32(np.int32(1), hist_M_6832)) + ) + local_mem_needed_6834 = hist_el_sizze_6821 * sext_i32_i64(hist_M_6832) + hist_S_6835 = sext_i64_i32( + sdiv_up64(((hist_H_6820 * local_mem_needed_6834) + np.int64(1)), hist_L_6824) + ) + if sle64(hist_H_6820, hist_Nin_6830) and ( + sle64(local_mem_needed_6834, hist_L_6824) + and ( + sle32(hist_S_6835, np.int32(3)) + and ( + sle64(hist_C_6833, max_tblock_sizze_6825) + and slt32(np.int32(0), hist_M_6832) + ) + ) + ): + num_subhistos_6815 = num_tblocks_6826 + if num_subhistos_6815 == np.int64(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + defunc_0_map_res_subhistos_mem_6816 = opencl_alloc( + self, + ((num_subhistos_6815 * np.int64(256)) * np.int64(8)), + "defunc_0_map_res_subhistos_mem_6816", + ) + self.futhark_builtinzhreplicate_i64( + defunc_0_map_res_subhistos_mem_6816, + (num_subhistos_6815 * np.int64(256)), + np.int64(0), + ) + lmad_copy_gpu2gpu( + self, + ct.c_int64, + defunc_0_map_res_subhistos_mem_6816, + np.int64(0), + [np.int64(1)], + mem_6758, + np.int64(0), + [np.int64(1)], + [np.int64(256)], + ) + chk_i_6836 = np.int32(0) + one_7031 = np.int32(1) + for counter_7030 in range(hist_S_6835): + num_segments_6837 = np.int64(1) + hist_H_chk_6838 = 
sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6835)) + histo_sizze_6839 = hist_H_chk_6838 + init_per_thread_6840 = sext_i64_i32( + sdiv_up64( + (sext_i32_i64(hist_M_6832) * histo_sizze_6839), max_tblock_sizze_6825 + ) + ) + if (1 * (np.int64(num_tblocks_6826) * self.max_thread_block_size)) != 0: + self.byte_histogramziseghist_local_6312_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(8) * (hist_M_6832 * hist_H_chk_6838)) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * (hist_M_6832 * hist_H_chk_6838)), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(n_5765), + ct.c_int64(num_subhistos_6815), + ct.c_int64(num_tblocks_6826), + ct.c_int32(hist_M_6832), + ct.c_int32(chk_i_6836), + ct.c_int64(num_segments_6837), + ct.c_int64(hist_H_chk_6838), + ct.c_int64(histo_sizze_6839), + ct.c_int32(init_per_thread_6840), + xs_mem_6757, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.byte_histogramziseghist_local_6312_var, + ((np.int64(num_tblocks_6826) * self.max_thread_block_size),), + (self.max_thread_block_size,), + ) + if synchronous: + sync(self) + chk_i_6836 += one_7031 + else: + hist_H_6872 = np.int64(256) + hist_RF_6873 = (np.float64(0.0) + sitofp_i32_f64(np.int64(1))) / np.float64(1.0) + hist_el_sizze_6874 = np.int32(8) + hist_C_max_6875 = fmin64( + sitofp_i32_f64(sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305)), + (sitofp_i32_f64(hist_H_6872) / np.float64(2.0)), + ) + hist_M_min_6876 = smax32( + np.int32(1), + sext_i64_i32( + fptosi_f64_i64( + sitofp_i32_f64( + sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305) + ) + / hist_C_max_6875 + ) + ), + ) + L2_sizze_6877 = self.max_cache + hist_RACE_exp_6878 = fmax64( + np.float64(1.0), + ( + (np.float64(0.75) * hist_RF_6873) + / (np.float64(64.0) / sitofp_i32_f64(hist_el_sizze_6874)) + ), + ) + if slt64(n_5765, hist_H_6872): + hist_S_6879 = np.int32(1) + else: + hist_S_6879 = sext_i64_i32( + sdiv_up64( + ( + (sext_i32_i64(hist_M_min_6876) * hist_H_6872) + * sext_i32_i64(hist_el_sizze_6874) + ), + fptosi_f64_i64( + (np.float64(0.4) * sitofp_i32_f64(L2_sizze_6877)) + * hist_RACE_exp_6878 + ), + ) + ) + hist_H_chk_6880 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6879)) + hist_k_max_6881 = fmin64( + ( + ( + np.float64(0.4) + * (sitofp_i32_f64(L2_sizze_6877) / sitofp_i32_f64(np.int32(8))) + ) + * hist_RACE_exp_6878 + ), + sitofp_i32_f64(n_5765), + ) / sitofp_i32_f64(sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305)) + hist_u_6882 = np.int64(2) + hist_C_6883 = fmin64( + sitofp_i32_f64(sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305)), + (sitofp_i32_f64(hist_u_6882 * hist_H_chk_6880) / hist_k_max_6881), + ) + hist_M_6884 = np.int32(1) + num_subhistos_6815 = sext_i32_i64(hist_M_6884) + if hist_M_6884 == np.int32(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + if num_subhistos_6815 == np.int64(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + defunc_0_map_res_subhistos_mem_6816 = opencl_alloc( + self, + ((num_subhistos_6815 * np.int64(256)) * np.int64(8)), + "defunc_0_map_res_subhistos_mem_6816", + ) + self.futhark_builtinzhreplicate_i64( + defunc_0_map_res_subhistos_mem_6816, + (num_subhistos_6815 * np.int64(256)), + np.int64(0), + ) + lmad_copy_gpu2gpu( + self, + ct.c_int64, + defunc_0_map_res_subhistos_mem_6816, + np.int64(0), + [np.int64(1)], + mem_6758, + np.int64(0), + [np.int64(1)], + [np.int64(256)], + ) + chk_i_6885 = np.int32(0) + one_7033 = np.int32(1) + for counter_7032 in 
range(hist_S_6879): + hist_H_chk_6886 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6879)) + if ( + 1 + * ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ) + ) != 0: + self.byte_histogramziseghist_global_6312_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + ct.c_int64(n_5765), + ct.c_int64(num_tblocks_6307), + ct.c_int64(num_subhistos_6815), + ct.c_int32(chk_i_6885), + ct.c_int64(hist_H_chk_6886), + xs_mem_6757, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.byte_histogramziseghist_global_6312_var, + ( + ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ), + ), + (self.sizes["byte_histogram.seghist_tblock_size_6304"],), + ) + if synchronous: + sync(self) + chk_i_6885 += one_7033 + if num_subhistos_6815 == np.int64(1): + mem_6758 = defunc_0_map_res_subhistos_mem_6816 + else: + chunk_sizze_6902 = np.int64(1) + if slt64( + (num_subhistos_6815 * np.int64(2)), + (seghist_tblock_sizze_6305 * chunk_sizze_6902), + ): + segment_sizze_nonzzero_6903 = smax64(np.int64(1), num_subhistos_6815) + num_threads_6904 = seghist_tblock_sizze_6305 * seghist_tblock_sizze_6305 + if ( + 1 + * ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ) + ) != 0: + self.byte_histogramzisegred_small_6901_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(8) * seghist_tblock_sizze_6305) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * seghist_tblock_sizze_6305), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(num_tblocks_6307), + ct.c_int64(num_subhistos_6815), + ct.c_int64(segment_sizze_nonzzero_6903), + mem_6758, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.byte_histogramzisegred_small_6901_var, + ( + ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ), + ), + (self.sizes["byte_histogram.seghist_tblock_size_6304"],), + ) + if synchronous: + sync(self) + else: + blocks_per_segment_6932 = sdiv_up64( + num_tblocks_6307, smax64(np.int64(1), np.int64(256)) + ) + q_6933 = sdiv_up64( + num_subhistos_6815, + ((seghist_tblock_sizze_6305 * blocks_per_segment_6932) * chunk_sizze_6902), + ) + num_virtblocks_6934 = blocks_per_segment_6932 * np.int64(256) + threads_per_segment_6935 = blocks_per_segment_6932 * seghist_tblock_sizze_6305 + segred_tmp_mem_6936 = opencl_alloc( + self, (np.int64(8) * num_virtblocks_6934), "segred_tmp_mem_6936" + ) + if ( + 1 + * ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ) + ) != 0: + self.byte_histogramzisegred_large_6901_var.set_args( + cl.LocalMemory( + max( + ( + np.int32(8) + + ( + (np.int64(8) * seghist_tblock_sizze_6305) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * seghist_tblock_sizze_6305), + np.int64(8), + ) + ), + np.int64(8), + ) + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(num_tblocks_6307), + ct.c_int64(num_subhistos_6815), + ct.c_int64(blocks_per_segment_6932), + ct.c_int64(q_6933), + ct.c_int64(num_virtblocks_6934), + ct.c_int64(threads_per_segment_6935), + mem_6758, + defunc_0_map_res_subhistos_mem_6816, + segred_tmp_mem_6936, + self.constants["counters_mem_6938"], + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.byte_histogramzisegred_large_6901_var, + ( + ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ), + ), + 
(self.sizes["byte_histogram.seghist_tblock_size_6304"],), + ) + if synchronous: + sync(self) + mem_out_6793 = mem_6758 + return mem_out_6793 + + def futhark_entry_chunked_entropy(self, xs_mem_6757, n_6046, chunk_sizze_6047): + zzero_6194 = chunk_sizze_6047 == np.int64(0) + nonzzero_6195 = not (zzero_6194) + nonzzero_cert_6196 = True + assert ( + nonzzero_6195 + ), "Error: {}\n\nBacktrace:\n-> #0 ofrak_gpu/entropy.fut:12:9-23\n #1 ofrak_gpu/entropy.fut:11:1-13:93\n".format( + "division by zero" + ) + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 = sdiv64(n_6046, chunk_sizze_6047) + bounds_invalid_upwards_6198 = slt64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, np.int64(0) + ) + valid_6199 = not (bounds_invalid_upwards_6198) + range_valid_c_6200 = True + assert valid_6199, ( + "Error: %s%d%s%d%s\n\nBacktrace:\n-> #0 ofrak_gpu/entropy.fut:12:4-24\n #1 ofrak_gpu/entropy.fut:11:1-13:93\n" + % ( + "Range ", + np.int64(0), + "..<", + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, + " is invalid.", + ) + ) + suff_outer_par_6346 = ( + self.sizes["chunked_entropy.suff_outer_par_0"] + <= dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + ) + segmap_tblock_sizze_6600 = self.sizes["chunked_entropy.segmap_tblock_size_6577"] + segmap_tblock_sizze_6639 = self.sizes["chunked_entropy.segmap_tblock_size_6523"] + max_num_tblocks_6795 = self.sizes["chunked_entropy.segmap_num_tblocks_6525"] + num_tblocks_6640 = sext_i64_i32( + smax64( + np.int64(1), + smin64( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6639 + ), + max_num_tblocks_6795, + ), + ) + ) + nest_sizze_6662 = np.int64(256) * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + segred_tblock_sizze_6663 = self.sizes["chunked_entropy.segred_tblock_size_6473"] + max_num_tblocks_6796 = self.sizes["chunked_entropy.segred_num_tblocks_6475"] + num_tblocks_6664 = sext_i64_i32( + smax64( + np.int64(1), + smin64(sdiv_up64(nest_sizze_6662, segred_tblock_sizze_6663), max_num_tblocks_6796), + ) + ) + segmap_tblock_sizze_6683 = self.sizes["chunked_entropy.segmap_tblock_size_6456"] + segmap_tblock_sizze_6401 = self.sizes["chunked_entropy.segmap_tblock_size_6350"] + binop_y_6768 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 - np.int64(1) + binop_x_6770 = smax64(np.int64(0), binop_y_6768) + binop_y_6772 = np.int64(255) * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + binop_y_6773 = smax64(np.int64(0), binop_y_6772) + binop_y_6774 = binop_x_6770 + binop_y_6773 + binop_y_6775 = np.int64(1) + binop_y_6774 + bytes_6776 = np.int64(8) * binop_y_6775 + bytes_6779 = np.int64(4) * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + shared_memory_capacity_6934 = self.max_shared_memory + if suff_outer_par_6346 and sle64(np.int64(0), shared_memory_capacity_6934): + segmap_usable_groups_6402 = sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6401 + ) + mem_6791 = opencl_alloc( + self, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, "mem_6791" + ) + virt_num_tblocks_6797 = sext_i64_i32( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6401 + ) + ) + if ( + 1 + * ( + np.int64(segmap_usable_groups_6402) + * self.sizes["chunked_entropy.segmap_tblock_size_6350"] + ) + ) != 0: + self.chunked_entropyzisegmap_6405_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + self.failure_is_an_option, + self.global_failure_args, + ct.c_int64(n_6046), + ct.c_int64(chunk_sizze_6047), + 
ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + xs_mem_6757, + mem_6791, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegmap_6405_var, + ( + ( + np.int64(segmap_usable_groups_6402) + * self.sizes["chunked_entropy.segmap_tblock_size_6350"] + ), + ), + (self.sizes["chunked_entropy.segmap_tblock_size_6350"],), + ) + if synchronous: + sync(self) + self.failure_is_an_option = np.int32(1) + ext_mem_6792 = mem_6791 + else: + segmap_usable_groups_6601 = sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6600 + ) + mem_6759 = opencl_alloc(self, np.int64(0), "mem_6759") + virt_num_tblocks_6809 = sext_i64_i32( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6600 + ) + ) + if ( + 1 + * ( + np.int64(segmap_usable_groups_6601) + * self.sizes["chunked_entropy.segmap_tblock_size_6577"] + ) + ) != 0: + self.chunked_entropyzisegmap_6606_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + self.failure_is_an_option, + self.global_failure_args, + ct.c_int64(n_6046), + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + mem_6759, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegmap_6606_var, + ( + ( + np.int64(segmap_usable_groups_6601) + * self.sizes["chunked_entropy.segmap_tblock_size_6577"] + ), + ), + (self.sizes["chunked_entropy.segmap_tblock_size_6577"],), + ) + if synchronous: + sync(self) + self.failure_is_an_option = np.int32(1) + mem_6777 = opencl_alloc(self, bytes_6776, "mem_6777") + virt_num_tblocks_6818 = sext_i64_i32( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6639 + ) + ) + if ( + 1 + * ( + np.int64(num_tblocks_6640) + * self.sizes["chunked_entropy.segmap_tblock_size_6523"] + ) + ) != 0: + self.chunked_entropyzisegmap_6645_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + ct.c_int64(num_tblocks_6640), + ct.c_int32(virt_num_tblocks_6818), + xs_mem_6757, + mem_6759, + mem_6777, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegmap_6645_var, + ( + ( + np.int64(num_tblocks_6640) + * self.sizes["chunked_entropy.segmap_tblock_size_6523"] + ), + ), + (self.sizes["chunked_entropy.segmap_tblock_size_6523"],), + ) + if synchronous: + sync(self) + mem_6759 = None + mem_6780 = opencl_alloc(self, bytes_6779, "mem_6780") + chunk_sizze_6833 = np.int64(1) + if slt64(np.int64(512), (segred_tblock_sizze_6663 * chunk_sizze_6833)): + segment_sizze_nonzzero_6834 = smax64(np.int64(1), np.int64(256)) + num_threads_6835 = segred_tblock_sizze_6663 * segred_tblock_sizze_6663 + if ( + 1 + * ( + np.int64(num_tblocks_6664) + * self.sizes["chunked_entropy.segred_tblock_size_6473"] + ) + ) != 0: + self.chunked_entropyzisegred_small_6669_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(4) * segred_tblock_sizze_6663) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(4) * segred_tblock_sizze_6663), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + ct.c_int64(num_tblocks_6664), + ct.c_int64(segment_sizze_nonzzero_6834), + mem_6777, + mem_6780, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegred_small_6669_var, + ( + ( + np.int64(num_tblocks_6664) + * 
self.sizes["chunked_entropy.segred_tblock_size_6473"] + ), + ), + (self.sizes["chunked_entropy.segred_tblock_size_6473"],), + ) + if synchronous: + sync(self) + else: + blocks_per_segment_6862 = sdiv_up64( + num_tblocks_6664, + smax64(np.int64(1), dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + ) + q_6863 = sdiv_up64( + np.int64(256), + ((segred_tblock_sizze_6663 * blocks_per_segment_6862) * chunk_sizze_6833), + ) + num_virtblocks_6864 = ( + blocks_per_segment_6862 * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + ) + threads_per_segment_6865 = blocks_per_segment_6862 * segred_tblock_sizze_6663 + segred_tmp_mem_6866 = opencl_alloc( + self, (np.int64(4) * num_virtblocks_6864), "segred_tmp_mem_6866" + ) + if ( + 1 + * ( + np.int64(num_tblocks_6664) + * self.sizes["chunked_entropy.segred_tblock_size_6473"] + ) + ) != 0: + self.chunked_entropyzisegred_large_6669_var.set_args( + cl.LocalMemory( + max( + ( + np.int32(8) + + ( + (np.int64(4) * segred_tblock_sizze_6663) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(4) * segred_tblock_sizze_6663), + np.int64(8), + ) + ), + np.int64(8), + ) + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + ct.c_int64(num_tblocks_6664), + ct.c_int64(blocks_per_segment_6862), + ct.c_int64(q_6863), + ct.c_int64(num_virtblocks_6864), + ct.c_int64(threads_per_segment_6865), + mem_6777, + mem_6780, + segred_tmp_mem_6866, + self.constants["counters_mem_6868"], + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegred_large_6669_var, + ( + ( + np.int64(num_tblocks_6664) + * self.sizes["chunked_entropy.segred_tblock_size_6473"] + ), + ), + (self.sizes["chunked_entropy.segred_tblock_size_6473"],), + ) + if synchronous: + sync(self) + mem_6777 = None + segmap_usable_groups_6684 = sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6683 + ) + mem_6782 = opencl_alloc( + self, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, "mem_6782" + ) + virt_num_tblocks_6925 = sext_i64_i32( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6683 + ) + ) + if ( + 1 + * ( + np.int64(segmap_usable_groups_6684) + * self.sizes["chunked_entropy.segmap_tblock_size_6456"] + ) + ) != 0: + self.chunked_entropyzisegmap_6687_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + mem_6780, + mem_6782, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegmap_6687_var, + ( + ( + np.int64(segmap_usable_groups_6684) + * self.sizes["chunked_entropy.segmap_tblock_size_6456"] + ), + ), + (self.sizes["chunked_entropy.segmap_tblock_size_6456"],), + ) + if synchronous: + sync(self) + mem_6780 = None + ext_mem_6792 = mem_6782 + mem_out_6793 = ext_mem_6792 + prim_out_6794 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + return (mem_out_6793, prim_out_6794) + + def futhark_entry_entropy(self, xs_mem_6757, n_5907): + mem_6758 = opencl_alloc(self, np.int64(2048), "mem_6758") + self.futhark_builtinzhreplicate_i64(mem_6758, np.int64(256), np.int64(0)) + seghist_tblock_sizze_6321 = self.sizes["entropy.seghist_tblock_size_6320"] + max_num_tblocks_6814 = self.sizes["entropy.seghist_num_tblocks_6322"] + num_tblocks_6323 = sext_i64_i32( + smax64( + np.int64(1), + smin64(sdiv_up64(n_5907, seghist_tblock_sizze_6321), max_num_tblocks_6814), + ) + ) + h_6818 = np.int64(2048) + seg_h_6819 = 
np.int64(2048) + if seg_h_6819 == np.int64(0): + pass + else: + hist_H_6820 = np.int64(256) + hist_el_sizze_6821 = sdiv_up64(h_6818, hist_H_6820) + hist_N_6822 = n_5907 + hist_RF_6823 = np.int32(1) + hist_L_6824 = self.max_shared_memory + max_tblock_sizze_6825 = self.max_thread_block_size + num_tblocks_6826 = sdiv_up64( + sext_i32_i64(sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321)), + max_tblock_sizze_6825, + ) + hist_m_prime_6827 = sitofp_i64_f64( + smin64( + squot64(hist_L_6824, hist_el_sizze_6821), + sdiv_up64(hist_N_6822, num_tblocks_6826), + ) + ) / sitofp_i64_f64(hist_H_6820) + hist_M0_6828 = smax64( + np.int64(1), smin64(fptosi_f64_i64(hist_m_prime_6827), max_tblock_sizze_6825) + ) + hist_Nout_6829 = np.int64(1) + hist_Nin_6830 = n_5907 + work_asymp_M_max_6831 = squot64( + (hist_Nout_6829 * hist_N_6822), ((np.int64(2) * num_tblocks_6826) * hist_H_6820) + ) + hist_M_6832 = sext_i64_i32(smin64(hist_M0_6828, work_asymp_M_max_6831)) + hist_C_6833 = sdiv_up64( + max_tblock_sizze_6825, sext_i32_i64(smax32(np.int32(1), hist_M_6832)) + ) + local_mem_needed_6834 = hist_el_sizze_6821 * sext_i32_i64(hist_M_6832) + hist_S_6835 = sext_i64_i32( + sdiv_up64(((hist_H_6820 * local_mem_needed_6834) + np.int64(1)), hist_L_6824) + ) + if sle64(hist_H_6820, hist_Nin_6830) and ( + sle64(local_mem_needed_6834, hist_L_6824) + and ( + sle32(hist_S_6835, np.int32(3)) + and ( + sle64(hist_C_6833, max_tblock_sizze_6825) + and slt32(np.int32(0), hist_M_6832) + ) + ) + ): + num_subhistos_6815 = num_tblocks_6826 + if num_subhistos_6815 == np.int64(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + defunc_0_map_res_subhistos_mem_6816 = opencl_alloc( + self, + ((num_subhistos_6815 * np.int64(256)) * np.int64(8)), + "defunc_0_map_res_subhistos_mem_6816", + ) + self.futhark_builtinzhreplicate_i64( + defunc_0_map_res_subhistos_mem_6816, + (num_subhistos_6815 * np.int64(256)), + np.int64(0), + ) + lmad_copy_gpu2gpu( + self, + ct.c_int64, + defunc_0_map_res_subhistos_mem_6816, + np.int64(0), + [np.int64(1)], + mem_6758, + np.int64(0), + [np.int64(1)], + [np.int64(256)], + ) + chk_i_6836 = np.int32(0) + one_7035 = np.int32(1) + for counter_7034 in range(hist_S_6835): + num_segments_6837 = np.int64(1) + hist_H_chk_6838 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6835)) + histo_sizze_6839 = hist_H_chk_6838 + init_per_thread_6840 = sext_i64_i32( + sdiv_up64( + (sext_i32_i64(hist_M_6832) * histo_sizze_6839), max_tblock_sizze_6825 + ) + ) + if (1 * (np.int64(num_tblocks_6826) * self.max_thread_block_size)) != 0: + self.entropyziseghist_local_6328_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(8) * (hist_M_6832 * hist_H_chk_6838)) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * (hist_M_6832 * hist_H_chk_6838)), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(n_5907), + ct.c_int64(num_subhistos_6815), + ct.c_int64(num_tblocks_6826), + ct.c_int32(hist_M_6832), + ct.c_int32(chk_i_6836), + ct.c_int64(num_segments_6837), + ct.c_int64(hist_H_chk_6838), + ct.c_int64(histo_sizze_6839), + ct.c_int32(init_per_thread_6840), + xs_mem_6757, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyziseghist_local_6328_var, + ((np.int64(num_tblocks_6826) * self.max_thread_block_size),), + (self.max_thread_block_size,), + ) + if synchronous: + sync(self) + chk_i_6836 += one_7035 + else: + hist_H_6872 = np.int64(256) + hist_RF_6873 = (np.float64(0.0) + sitofp_i32_f64(np.int64(1))) / np.float64(1.0) + 
hist_el_sizze_6874 = np.int32(8) + hist_C_max_6875 = fmin64( + sitofp_i32_f64(sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321)), + (sitofp_i32_f64(hist_H_6872) / np.float64(2.0)), + ) + hist_M_min_6876 = smax32( + np.int32(1), + sext_i64_i32( + fptosi_f64_i64( + sitofp_i32_f64( + sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321) + ) + / hist_C_max_6875 + ) + ), + ) + L2_sizze_6877 = self.max_cache + hist_RACE_exp_6878 = fmax64( + np.float64(1.0), + ( + (np.float64(0.75) * hist_RF_6873) + / (np.float64(64.0) / sitofp_i32_f64(hist_el_sizze_6874)) + ), + ) + if slt64(n_5907, hist_H_6872): + hist_S_6879 = np.int32(1) + else: + hist_S_6879 = sext_i64_i32( + sdiv_up64( + ( + (sext_i32_i64(hist_M_min_6876) * hist_H_6872) + * sext_i32_i64(hist_el_sizze_6874) + ), + fptosi_f64_i64( + (np.float64(0.4) * sitofp_i32_f64(L2_sizze_6877)) + * hist_RACE_exp_6878 + ), + ) + ) + hist_H_chk_6880 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6879)) + hist_k_max_6881 = fmin64( + ( + ( + np.float64(0.4) + * (sitofp_i32_f64(L2_sizze_6877) / sitofp_i32_f64(np.int32(8))) + ) + * hist_RACE_exp_6878 + ), + sitofp_i32_f64(n_5907), + ) / sitofp_i32_f64(sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321)) + hist_u_6882 = np.int64(2) + hist_C_6883 = fmin64( + sitofp_i32_f64(sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321)), + (sitofp_i32_f64(hist_u_6882 * hist_H_chk_6880) / hist_k_max_6881), + ) + hist_M_6884 = np.int32(1) + num_subhistos_6815 = sext_i32_i64(hist_M_6884) + if hist_M_6884 == np.int32(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + if num_subhistos_6815 == np.int64(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + defunc_0_map_res_subhistos_mem_6816 = opencl_alloc( + self, + ((num_subhistos_6815 * np.int64(256)) * np.int64(8)), + "defunc_0_map_res_subhistos_mem_6816", + ) + self.futhark_builtinzhreplicate_i64( + defunc_0_map_res_subhistos_mem_6816, + (num_subhistos_6815 * np.int64(256)), + np.int64(0), + ) + lmad_copy_gpu2gpu( + self, + ct.c_int64, + defunc_0_map_res_subhistos_mem_6816, + np.int64(0), + [np.int64(1)], + mem_6758, + np.int64(0), + [np.int64(1)], + [np.int64(256)], + ) + chk_i_6885 = np.int32(0) + one_7037 = np.int32(1) + for counter_7036 in range(hist_S_6879): + hist_H_chk_6886 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6879)) + if ( + 1 + * ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ) + ) != 0: + self.entropyziseghist_global_6328_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + ct.c_int64(n_5907), + ct.c_int64(num_tblocks_6323), + ct.c_int64(num_subhistos_6815), + ct.c_int32(chk_i_6885), + ct.c_int64(hist_H_chk_6886), + xs_mem_6757, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyziseghist_global_6328_var, + ( + ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ), + ), + (self.sizes["entropy.seghist_tblock_size_6320"],), + ) + if synchronous: + sync(self) + chk_i_6885 += one_7037 + if num_subhistos_6815 == np.int64(1): + mem_6758 = defunc_0_map_res_subhistos_mem_6816 + else: + chunk_sizze_6902 = np.int64(1) + if slt64( + (num_subhistos_6815 * np.int64(2)), + (seghist_tblock_sizze_6321 * chunk_sizze_6902), + ): + segment_sizze_nonzzero_6903 = smax64(np.int64(1), num_subhistos_6815) + num_threads_6904 = seghist_tblock_sizze_6321 * seghist_tblock_sizze_6321 + if ( + 1 + * ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ) + ) != 0: + 
self.entropyzisegred_small_6901_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(8) * seghist_tblock_sizze_6321) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * seghist_tblock_sizze_6321), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(num_tblocks_6323), + ct.c_int64(num_subhistos_6815), + ct.c_int64(segment_sizze_nonzzero_6903), + mem_6758, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyzisegred_small_6901_var, + ( + ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ), + ), + (self.sizes["entropy.seghist_tblock_size_6320"],), + ) + if synchronous: + sync(self) + else: + blocks_per_segment_6932 = sdiv_up64( + num_tblocks_6323, smax64(np.int64(1), np.int64(256)) + ) + q_6933 = sdiv_up64( + num_subhistos_6815, + ((seghist_tblock_sizze_6321 * blocks_per_segment_6932) * chunk_sizze_6902), + ) + num_virtblocks_6934 = blocks_per_segment_6932 * np.int64(256) + threads_per_segment_6935 = blocks_per_segment_6932 * seghist_tblock_sizze_6321 + segred_tmp_mem_6936 = opencl_alloc( + self, (np.int64(8) * num_virtblocks_6934), "segred_tmp_mem_6936" + ) + if ( + 1 + * ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ) + ) != 0: + self.entropyzisegred_large_6901_var.set_args( + cl.LocalMemory( + max( + ( + np.int32(8) + + ( + (np.int64(8) * seghist_tblock_sizze_6321) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * seghist_tblock_sizze_6321), + np.int64(8), + ) + ), + np.int64(8), + ) + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(num_tblocks_6323), + ct.c_int64(num_subhistos_6815), + ct.c_int64(blocks_per_segment_6932), + ct.c_int64(q_6933), + ct.c_int64(num_virtblocks_6934), + ct.c_int64(threads_per_segment_6935), + mem_6758, + defunc_0_map_res_subhistos_mem_6816, + segred_tmp_mem_6936, + self.constants["counters_mem_6938"], + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyzisegred_large_6901_var, + ( + ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ), + ), + (self.sizes["entropy.seghist_tblock_size_6320"],), + ) + if synchronous: + sync(self) + i64_res_6254 = sitofp_i64_f32(n_5907) + segred_tblock_sizze_6337 = self.sizes["entropy.segred_tblock_size_6336"] + max_num_tblocks_6995 = self.sizes["entropy.segred_num_tblocks_6338"] + num_tblocks_6339 = sext_i64_i32( + smax64( + np.int64(1), + smin64(sdiv_up64(np.int64(256), segred_tblock_sizze_6337), max_num_tblocks_6995), + ) + ) + mem_6761 = opencl_alloc(self, np.int64(4), "mem_6761") + chunk_sizze_6996 = np.int64(1) + segred_tmp_mem_6999 = opencl_alloc( + self, (np.int64(4) * num_tblocks_6339), "segred_tmp_mem_6999" + ) + num_threads_7001 = num_tblocks_6339 * segred_tblock_sizze_6337 + if (1 * (np.int64(num_tblocks_6339) * self.sizes["entropy.segred_tblock_size_6336"])) != 0: + self.entropyzisegred_nonseg_6344_var.set_args( + cl.LocalMemory( + max( + ( + np.int32(8) + + ( + (np.int64(4) * segred_tblock_sizze_6337) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(4) * segred_tblock_sizze_6337), np.int64(8) + ) + ), + np.int64(8), + ) + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_float(i64_res_6254), + ct.c_int64(num_tblocks_6339), + ct.c_int64(num_threads_7001), + mem_6758, + mem_6761, + self.constants["counters_mem_6997"], + segred_tmp_mem_6999, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyzisegred_nonseg_6344_var, + ((np.int64(num_tblocks_6339) * 
self.sizes["entropy.segred_tblock_size_6336"]),), + (self.sizes["entropy.segred_tblock_size_6336"],), + ) + if synchronous: + sync(self) + mem_6758 = None + read_res_7038 = np.empty(1, dtype=ct.c_float) + cl.enqueue_copy( + self.queue, + read_res_7038, + mem_6761, + src_offset=(np.int64(np.int64(0)) * 4), + is_blocking=synchronous, + ) + sync(self) + defunc_0_f_res_6297 = read_res_7038[0] + mem_6761 = None + zs_lhs_6270 = np.float32(-1.0) * defunc_0_f_res_6297 + log2_res_6272 = futhark_log2_32(i64_res_6254) + lifted_lambda_res_6273 = zs_lhs_6270 / log2_res_6272 + prim_out_6793 = lifted_lambda_res_6273 + return prim_out_6793 + + def byte_histogram(self, xs_mem_6757_ext): + n_5765 = None + try: + assert (type(xs_mem_6757_ext) in [np.ndarray, cl.array.Array]) and ( + xs_mem_6757_ext.dtype == np.uint8 + ), "Parameter has unexpected type" + if n_5765 == None: + n_5765 = np.int64(xs_mem_6757_ext.shape[0]) + else: + assert ( + n_5765 == xs_mem_6757_ext.shape[0] + ), "Error: entry point arguments have invalid sizes." + if type(xs_mem_6757_ext) == cl.array.Array: + xs_mem_6757 = xs_mem_6757_ext.data + else: + xs_mem_6757 = opencl_alloc(self, np.int64(xs_mem_6757_ext.nbytes), "xs_mem_6757") + if np.int64(xs_mem_6757_ext.nbytes) != 0: + cl.enqueue_copy( + self.queue, + xs_mem_6757, + normaliseArray(xs_mem_6757_ext), + is_blocking=synchronous, + ) + except (TypeError, AssertionError) as e: + raise TypeError( + "Argument #0 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format( + "[]u8", type(xs_mem_6757_ext), xs_mem_6757_ext + ) + ) + time_start = time.time() + with np.errstate(divide="ignore", over="ignore", under="ignore", invalid="ignore"): + mem_out_6793 = self.futhark_entry_byte_histogram(xs_mem_6757, n_5765) + runtime = int(time.time() * 1000000) - int(time_start * 1000000) + sync(self) + return cl.array.Array(self.queue, (np.int64(256),), np.int64, data=mem_out_6793) + + def chunked_entropy(self, chunk_sizze_6047_ext, xs_mem_6757_ext): + n_6046 = None + try: + chunk_sizze_6047 = np.int64(ct.c_int64(chunk_sizze_6047_ext)) + except (TypeError, AssertionError) as e: + raise TypeError( + "Argument #0 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format( + "i64", type(chunk_sizze_6047_ext), chunk_sizze_6047_ext + ) + ) + try: + assert (type(xs_mem_6757_ext) in [np.ndarray, cl.array.Array]) and ( + xs_mem_6757_ext.dtype == np.uint8 + ), "Parameter has unexpected type" + if n_6046 == None: + n_6046 = np.int64(xs_mem_6757_ext.shape[0]) + else: + assert ( + n_6046 == xs_mem_6757_ext.shape[0] + ), "Error: entry point arguments have invalid sizes." 
+ if type(xs_mem_6757_ext) == cl.array.Array: + xs_mem_6757 = xs_mem_6757_ext.data + else: + xs_mem_6757 = opencl_alloc(self, np.int64(xs_mem_6757_ext.nbytes), "xs_mem_6757") + if np.int64(xs_mem_6757_ext.nbytes) != 0: + cl.enqueue_copy( + self.queue, + xs_mem_6757, + normaliseArray(xs_mem_6757_ext), + is_blocking=synchronous, + ) + except (TypeError, AssertionError) as e: + raise TypeError( + "Argument #1 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format( + "[]u8", type(xs_mem_6757_ext), xs_mem_6757_ext + ) + ) + time_start = time.time() + with np.errstate(divide="ignore", over="ignore", under="ignore", invalid="ignore"): + (mem_out_6793, prim_out_6794) = self.futhark_entry_chunked_entropy( + xs_mem_6757, n_6046, chunk_sizze_6047 + ) + runtime = int(time.time() * 1000000) - int(time_start * 1000000) + sync(self) + return cl.array.Array(self.queue, (prim_out_6794,), np.uint8, data=mem_out_6793) + + def entropy(self, xs_mem_6757_ext): + n_5907 = None + try: + assert (type(xs_mem_6757_ext) in [np.ndarray, cl.array.Array]) and ( + xs_mem_6757_ext.dtype == np.uint8 + ), "Parameter has unexpected type" + if n_5907 == None: + n_5907 = np.int64(xs_mem_6757_ext.shape[0]) + else: + assert ( + n_5907 == xs_mem_6757_ext.shape[0] + ), "Error: entry point arguments have invalid sizes." + if type(xs_mem_6757_ext) == cl.array.Array: + xs_mem_6757 = xs_mem_6757_ext.data + else: + xs_mem_6757 = opencl_alloc(self, np.int64(xs_mem_6757_ext.nbytes), "xs_mem_6757") + if np.int64(xs_mem_6757_ext.nbytes) != 0: + cl.enqueue_copy( + self.queue, + xs_mem_6757, + normaliseArray(xs_mem_6757_ext), + is_blocking=synchronous, + ) + except (TypeError, AssertionError) as e: + raise TypeError( + "Argument #0 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format( + "[]u8", type(xs_mem_6757_ext), xs_mem_6757_ext + ) + ) + time_start = time.time() + with np.errstate(divide="ignore", over="ignore", under="ignore", invalid="ignore"): + prim_out_6793 = self.futhark_entry_entropy(xs_mem_6757, n_5907) + runtime = int(time.time() * 1000000) - int(time_start * 1000000) + sync(self) + return np.float32(prim_out_6793) diff --git a/ofrak_gpu/ofrak_gpu/run.py b/ofrak_gpu/ofrak_gpu/run_entropy.py similarity index 100% rename from ofrak_gpu/ofrak_gpu/run.py rename to ofrak_gpu/ofrak_gpu/run_entropy.py
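The generated `entropy.py` above exposes three public entry points, visible in the wrappers at the end of the diff: `byte_histogram` (a 256-bin `int64` histogram of byte values), `chunked_entropy` (an array of `uint8` entropy values, one per chunk, with the chunk size passed as the first argument), and `entropy` (a single `np.float32` for the whole buffer). A minimal usage sketch follows; it assumes the Futhark pyopencl backend's convention of naming the generated class after the program (`entropy`), assumes default construction picks a usable OpenCL device, and uses a hypothetical input path, so treat it as illustrative rather than as part of the patch.

```python
# Minimal usage sketch, not part of the diff. Assumptions: the generated class
# is named `entropy` (Futhark pyopencl convention) and default construction
# selects a usable OpenCL platform/device; "firmware.bin" is a placeholder path.
import numpy as np
from ofrak_gpu.entropy import entropy

fut = entropy()
data = np.fromfile("firmware.bin", dtype=np.uint8)  # entry points require uint8 input

total = fut.entropy(data)                      # np.float32 scalar for the whole buffer
hist = fut.byte_histogram(data).get()          # shape-(256,) int64 histogram on the host
chunks = fut.chunked_entropy(256, data).get()  # one uint8 entropy value per 256-byte chunk
```

Note that the wrappers accept either `np.ndarray` or `pyopencl.array.Array` inputs and return `pyopencl` arrays for the array-valued results, so intermediate data can stay on-device between calls; `.get()` is only needed to copy results back to the host.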