diff --git a/ofrak_gpu/.coveragerc b/ofrak_gpu/.coveragerc
index e3a1cb90d..2d5ec5083 100644
--- a/ofrak_gpu/.coveragerc
+++ b/ofrak_gpu/.coveragerc
@@ -1,5 +1,5 @@
 [run]
 omit =
     ofrak_gpu/entropy.py
-    ofrak_gpu/run.py
-    ofrak_gpu/bench.py
+    ofrak_gpu/run_entropy.py
+    ofrak_gpu/bench_entropy.py
diff --git a/ofrak_gpu/.gitignore b/ofrak_gpu/.gitignore
deleted file mode 100644
index 732dc4d3c..000000000
--- a/ofrak_gpu/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# This file will be automatically generated from entropy.fut on build, and is quite unrult for VCS
-ofrak_gpu/entropy.py
diff --git a/ofrak_gpu/README.md b/ofrak_gpu/README.md
index 56a8e9af7..801945f2a 100644
--- a/ofrak_gpu/README.md
+++ b/ofrak_gpu/README.md
@@ -1 +1,25 @@
-# ofrak_gpu
+# OFRAK
+OFRAK (Open Firmware Reverse Analysis Konsole) is a binary analysis and modification platform that combines the ability to unpack, analyze, modify, and repack binaries.
+
+
+# Package: ofrak_gpu
+
+```
+OFRAK
+└───...
+└───ofrak_gpu <-- //YOU ARE HERE//
+│ └───entropy.fut // Futhark source code for GPU-bound entropy calculations
+│ └───entropy.py // Pyopencl equivalent of entropy.fut, generated by the Futhark compiler (see "Futhark Compilation")
+│ └───run_entropy.py // CLI to compute the entropy of a file
+└───...
+```
+
+This package contains the GPU-bound implementations of expensive computations performed by OFRAK.
+
+## Futhark Compilation
+The [Futhark language](https://futhark-lang.org/) is not required to install and run this module, nor will the Futhark compiler be installed into your Docker container. Instead, `entropy.py`, the compiled pyopencl output of `entropy.fut`, is provided. If you would like to make any changes to `entropy.fut`, you will have to re-generate `entropy.py` with `futhark pyopencl --library entropy.fut -o entropy.py` - see the `make futhark` target. The `--library` flag must be used.
+## Dependencies
+This package requires numpy, pyopencl, and an OpenCL platform to run. If you don't have an OpenCL platform and want to test your code, consider [oclgrind, pocl, or another pip-installable CPU runtime for OpenCL](https://documen.tician.de/pyopencl/misc.html#enabling-access-to-cpus-and-gpus-via-py-opencl). oclgrind is installed by requirements-test by default so that the tests can run. The Futhark compiler is not required; see "Futhark Compilation."
+
+## Testing
+This package maintains 100% test coverage of functions. See `ofrak_gpu_test`.
diff --git a/ofrak_gpu/max_entropy_256_B_windows.bin b/ofrak_gpu/max_entropy_256_B_windows.bin
deleted file mode 100644
index eb5d461ee..000000000
Binary files a/ofrak_gpu/max_entropy_256_B_windows.bin and /dev/null differ
diff --git a/ofrak_gpu/ofrak_gpu/entropy.py b/ofrak_gpu/ofrak_gpu/entropy.py
new file mode 100644
index 000000000..e79d169e7
--- /dev/null
+++ b/ofrak_gpu/ofrak_gpu/entropy.py
@@ -0,0 +1,12306 @@
+# Generated by Futhark 0.25.17.
+# Compiled with GHC 9.8.2.
+import sys
+import numpy as np
+import ctypes as ct
+
+# Stub code for OpenCL setup.
+
+import pyopencl as cl
+import numpy as np
+import sys
+
+if cl.version.VERSION < (2015, 2):
+    raise Exception(
+        "Futhark requires at least PyOpenCL version 2015.2. Installed version is %s."
+        % cl.version.VERSION_TEXT
+    )
+
+TR_BLOCK_DIM = 16
+TR_TILE_DIM = TR_BLOCK_DIM * 2
+TR_ELEMS_PER_THREAD = 8
+
+
+def parse_preferred_device(s):
+    pref_num = 0
+    if len(s) > 1 and s[0] == "#":
+        i = 1
+        while i < len(s):
+            if not s[i].isdigit():
+                break
+            else:
+                pref_num = pref_num * 10 + int(s[i])
+            i += 1
+        while i < len(s) and s[i].isspace():
+            i += 1
+        return (s[i:], pref_num)
+    else:
+        return (s, 0)
+
+
+def get_prefered_context(interactive=False, platform_pref=None, device_pref=None):
+    if device_pref != None:
+        (device_pref, device_num) = parse_preferred_device(device_pref)
+    else:
+        device_num = 0
+
+    if interactive:
+        return cl.create_some_context(interactive=True)
+
+    def blacklisted(p, d):
+        return (
+            platform_pref == None
+            and device_pref == None
+            and p.name == "Apple"
+            and d.name.find("Intel(R) Core(TM)") >= 0
+        )
+
+    def platform_ok(p):
+        return not platform_pref or p.name.find(platform_pref) >= 0
+
+    def device_ok(d):
+        return not device_pref or d.name.find(device_pref) >= 0
+
+    device_matches = 0
+
+    for p in cl.get_platforms():
+        if not platform_ok(p):
+            continue
+        for d in p.get_devices():
+            if blacklisted(p, d) or not device_ok(d):
+                continue
+            if device_matches == device_num:
+                return cl.Context(devices=[d])
+            else:
+                device_matches += 1
+    raise Exception("No OpenCL platform and device matching constraints found.")
+
+
+def param_assignment(s):
+    name, value = s.split("=")
+    return (name, int(value))
+
+
+def check_types(self, required_types):
+    if "f64" in required_types:
+        if self.device.get_info(cl.device_info.PREFERRED_VECTOR_WIDTH_DOUBLE) == 0:
+            raise Exception(
+                "Program uses double-precision floats, but this is not supported on chosen device: %s"
+                % self.device.name
+            )
+
+
+def apply_size_heuristics(self, size_heuristics, sizes):
+    for platform_name, device_type, size, valuef in size_heuristics:
+        if (
+            sizes[size] == None
+            and self.platform.name.find(platform_name) >= 0
+            and (self.device.type & device_type) == device_type
+        ):
+            sizes[size] = valuef(self.device)
+    return sizes
+
+
+def to_c_str_rep(x):
+    if type(x) is bool or type(x) is np.bool_:
+        if x:
+            return "true"
+        else:
+            return "false"
+    else:
+        return str(x)
+
+
+def initialise_opencl_object(
+    self,
+    program_src="",
+    build_options=[],
+    command_queue=None,
+    interactive=False,
+    platform_pref=None,
+    device_pref=None,
+    default_group_size=None,
+    default_num_groups=None,
+    default_tile_size=None,
+    default_reg_tile_size=None,
+    default_threshold=None,
+    size_heuristics=[],
+    required_types=[],
+    all_sizes={},
+    user_sizes={},
+    constants=[],
+):
+    if command_queue is None:
+        self.ctx = get_prefered_context(interactive, platform_pref, device_pref)
+        self.queue = cl.CommandQueue(self.ctx)
+    else:
+        self.ctx = command_queue.context
+        self.queue = command_queue
+    self.device = self.queue.device
+    self.platform = self.device.platform
+    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))
+    device_type = self.device.type
+
+    check_types(self, required_types)
+
+    max_group_size = int(self.device.max_work_group_size)
+    max_tile_size = int(np.sqrt(self.device.max_work_group_size))
+
+    self.max_thread_block_size = max_group_size
+    self.max_tile_size = max_tile_size
+    self.max_threshold = 0
+    self.max_grid_size = 0
+
+    self.max_shared_memory = int(self.device.local_mem_size)
+
+    # Futhark reserves 4 bytes of local memory for its own purposes.
+    self.max_shared_memory -= 4
+
+    # See comment in rts/c/opencl.h.
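+    # Certain platforms additionally reserve a few bytes of local memory per
+    # work-group for their own bookkeeping; subtract them up front so kernels
+    # never request more shared memory than is actually available.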
+    if self.platform.name.find("NVIDIA CUDA") >= 0:
+        self.max_shared_memory -= 12
+    elif self.platform.name.find("AMD") >= 0:
+        self.max_shared_memory -= 16
+
+    self.max_registers = int(2**16)  # Not sure how to query for this.
+
+    self.max_cache = self.device.get_info(cl.device_info.GLOBAL_MEM_CACHE_SIZE)
+
+    if self.max_cache == 0:
+        self.max_cache = 1024 * 1024
+
+    self.free_list = {}
+
+    self.global_failure = self.pool.allocate(np.int32().itemsize)
+    cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1), 0, np.int32().itemsize)
+    self.global_failure_args = self.pool.allocate(
+        np.int64().itemsize * (self.global_failure_args_max + 1)
+    )
+    self.failure_is_an_option = np.int32(0)
+
+    if "default_group_size" in sizes:
+        default_group_size = sizes["default_group_size"]
+        del sizes["default_group_size"]
+
+    if "default_num_groups" in sizes:
+        default_num_groups = sizes["default_num_groups"]
+        del sizes["default_num_groups"]
+
+    if "default_tile_size" in sizes:
+        default_tile_size = sizes["default_tile_size"]
+        del sizes["default_tile_size"]
+
+    if "default_reg_tile_size" in sizes:
+        default_reg_tile_size = sizes["default_reg_tile_size"]
+        del sizes["default_reg_tile_size"]
+
+    if "default_threshold" in sizes:
+        default_threshold = sizes["default_threshold"]
+        del sizes["default_threshold"]
+
+    default_group_size_set = default_group_size != None
+    default_tile_size_set = default_tile_size != None
+    default_sizes = apply_size_heuristics(
+        self,
+        size_heuristics,
+        {
+            "group_size": default_group_size,
+            "tile_size": default_tile_size,
+            "reg_tile_size": default_reg_tile_size,
+            "num_groups": default_num_groups,
+            "lockstep_width": None,
+            "threshold": default_threshold,
+        },
+    )
+    default_group_size = default_sizes["group_size"]
+    default_num_groups = default_sizes["num_groups"]
+    default_threshold = default_sizes["threshold"]
+    default_tile_size = default_sizes["tile_size"]
+    default_reg_tile_size = default_sizes["reg_tile_size"]
+    lockstep_width = default_sizes["lockstep_width"]
+
+    if default_group_size > max_group_size:
+        if default_group_size_set:
+            sys.stderr.write(
+                "Note: Device limits group size to {} (down from {})\n".format(
+                    max_group_size, default_group_size
+                )
+            )
+        default_group_size = max_group_size
+
+    if default_tile_size > max_tile_size:
+        if default_tile_size_set:
+            sys.stderr.write(
+                "Note: Device limits tile size to {} (down from {})\n".format(
+                    max_tile_size, default_tile_size
+                )
+            )
+        default_tile_size = max_tile_size
+
+    for k, v in user_sizes.items():
+        if k in all_sizes:
+            all_sizes[k]["value"] = v
+        else:
+            raise Exception(
+                "Unknown size: {}\nKnown sizes: {}".format(k, " ".join(all_sizes.keys()))
+            )
+
+    self.sizes = {}
+    for k, v in all_sizes.items():
+        if v["class"] == "thread_block_size":
+            max_value = max_group_size
+            default_value = default_group_size
+        elif v["class"] == "grid_size":
+            max_value = max_group_size  # Intentional!
+            default_value = default_num_groups
+        elif v["class"] == "tile_size":
+            max_value = max_tile_size
+            default_value = default_tile_size
+        elif v["class"] == "reg_tile_size":
+            max_value = None
+            default_value = default_reg_tile_size
+        elif v["class"].startswith("threshold"):
+            max_value = None
+            default_value = default_threshold
+        else:
+            # Bespoke sizes have no limit or default.
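+            # Whatever value was configured for them is used below without clamping.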
+            max_value = None
+        if v["value"] == None:
+            self.sizes[k] = default_value
+        elif max_value != None and v["value"] > max_value:
+            sys.stderr.write(
+                "Note: Device limits {} to {} (down from {})\n".format(k, max_value, v["value"])
+            )
+            self.sizes[k] = max_value
+        else:
+            self.sizes[k] = v["value"]
+
+    # XXX: we perform only a subset of z-encoding here. Really, the
+    # compiler should provide us with the variables to which
+    # parameters are mapped.
+    if len(program_src) >= 0:
+        build_options += [f"-DLOCKSTEP_WIDTH={lockstep_width}"]
+
+        build_options += ["-D{}={}".format("max_thread_block_size", max_group_size)]
+
+        build_options += [
+            "-D{}={}".format(
+                s.replace("z", "zz").replace(".", "zi").replace("#", "zh").replace("'", "zq"),
+                v,
+            )
+            for (s, v) in self.sizes.items()
+        ]
+
+        build_options += [f"-D{s}={to_c_str_rep(f())}" for (s, f) in constants]
+
+        if self.platform.name == "Oclgrind":
+            build_options += ["-DEMULATE_F16"]
+
+        build_options += [
+            f"-DTR_BLOCK_DIM={TR_BLOCK_DIM}",
+            f"-DTR_TILE_DIM={TR_TILE_DIM}",
+            f"-DTR_ELEMS_PER_THREAD={TR_ELEMS_PER_THREAD}",
+        ]
+
+        program = cl.Program(self.ctx, program_src).build(build_options)
+
+        self.transpose_kernels = {
+            1: {
+                "default": program.map_transpose_1b,
+                "low_height": program.map_transpose_1b_low_height,
+                "low_width": program.map_transpose_1b_low_width,
+                "small": program.map_transpose_1b_small,
+                "large": program.map_transpose_1b_large,
+            },
+            2: {
+                "default": program.map_transpose_2b,
+                "low_height": program.map_transpose_2b_low_height,
+                "low_width": program.map_transpose_2b_low_width,
+                "small": program.map_transpose_2b_small,
+                "large": program.map_transpose_2b_large,
+            },
+            4: {
+                "default": program.map_transpose_4b,
+                "low_height": program.map_transpose_4b_low_height,
+                "low_width": program.map_transpose_4b_low_width,
+                "small": program.map_transpose_4b_small,
+                "large": program.map_transpose_4b_large,
+            },
+            8: {
+                "default": program.map_transpose_8b,
+                "low_height": program.map_transpose_8b_low_height,
+                "low_width": program.map_transpose_8b_low_width,
+                "small": program.map_transpose_8b_small,
+                "large": program.map_transpose_8b_large,
+            },
+        }
+
+        self.copy_kernels = {
+            1: program.lmad_copy_1b,
+            2: program.lmad_copy_2b,
+            4: program.lmad_copy_4b,
+            8: program.lmad_copy_8b,
+        }
+
+        return program
+
+
+def opencl_alloc(self, min_size, tag):
+    min_size = 1 if min_size == 0 else min_size
+    assert min_size > 0
+    return self.pool.allocate(min_size)
+
+
+def opencl_free_all(self):
+    self.pool.free_held()
+
+
+def sync(self):
+    failure = np.empty(1, dtype=np.int32)
+    cl.enqueue_copy(self.queue, failure, self.global_failure, is_blocking=True)
+    self.failure_is_an_option = np.int32(0)
+    if failure[0] >= 0:
+        # Reset failure information.
+        cl.enqueue_fill_buffer(
+            self.queue,
+            self.global_failure,
+            np.int32(-1),
+            0,
+            np.int32().itemsize,
+        )
+
+        # Read failure args.
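+        # The failing kernel stored the arguments for its error message next to
+        # the failure code; fetch them so the exception below can be formatted.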
+        failure_args = np.empty(self.global_failure_args_max + 1, dtype=np.int64)
+        cl.enqueue_copy(
+            self.queue,
+            failure_args,
+            self.global_failure_args,
+            is_blocking=True,
+        )
+
+        raise Exception(self.failure_msgs[failure[0]].format(*failure_args))
+
+
+def map_transpose_gpu2gpu(self, elem_size, dst, dst_offset, src, src_offset, k, n, m):
+    kernels = self.transpose_kernels[elem_size]
+    kernel = kernels["default"]
+    mulx = TR_BLOCK_DIM / n
+    muly = TR_BLOCK_DIM / m
+
+    group_dims = (TR_TILE_DIM, TR_TILE_DIM // TR_ELEMS_PER_THREAD, 1)
+    dims = (
+        (m + TR_TILE_DIM - 1) // TR_TILE_DIM * group_dims[0],
+        (n + TR_TILE_DIM - 1) // TR_TILE_DIM * group_dims[1],
+        k,
+    )
+
+    k32 = np.int32(k)
+    n32 = np.int32(n)
+    m32 = np.int32(m)
+    mulx32 = np.int32(mulx)
+    muly32 = np.int32(muly)
+
+    kernel.set_args(
+        cl.LocalMemory(TR_TILE_DIM * (TR_TILE_DIM + 1) * elem_size),
+        dst,
+        dst_offset,
+        src,
+        src_offset,
+        k32,
+        m32,
+        n32,
+        mulx32,
+        muly32,
+        np.int32(0),
+        np.int32(0),
+    )
+    cl.enqueue_nd_range_kernel(self.queue, kernel, dims, group_dims)
+
+
+def copy_elements_gpu2gpu(
+    self,
+    elem_size,
+    dst,
+    dst_offset,
+    dst_strides,
+    src,
+    src_offset,
+    src_strides,
+    shape,
+):
+    r = len(shape)
+    if r > 8:
+        raise Exception("Futhark runtime limitation:\nCannot copy array of greater than rank 8.\n")
+
+    n = np.prod(shape)
+    zero = np.int64(0)
+    layout_args = [None] * (8 * 3)
+    for i in range(8):
+        if i < r:
+            layout_args[i * 3 + 0] = shape[i]
+            layout_args[i * 3 + 1] = dst_strides[i]
+            layout_args[i * 3 + 2] = src_strides[i]
+        else:
+            layout_args[i * 3 + 0] = zero
+            layout_args[i * 3 + 1] = zero
+            layout_args[i * 3 + 2] = zero
+
+    kernel = self.copy_kernels[elem_size]
+    kernel.set_args(
+        cl.LocalMemory(1),
+        dst,
+        dst_offset,
+        src,
+        src_offset,
+        n,
+        np.int32(r),
+        *layout_args,
+    )
+    w = 256
+    dims = ((n + w - 1) // w * w,)
+    group_dims = (w,)
+    cl.enqueue_nd_range_kernel(self.queue, kernel, dims, group_dims)
+
+
+def lmad_copy_gpu2gpu(self, pt, dst, dst_offset, dst_strides, src, src_offset, src_strides, shape):
+    elem_size = ct.sizeof(pt)
+    nbytes = np.prod(shape) * elem_size
+    if nbytes == 0:
+        return None
+    if lmad_memcpyable(dst_strides, src_strides, shape):
+        cl.enqueue_copy(
+            self.queue,
+            dst,
+            src,
+            dst_offset=dst_offset * elem_size,
+            src_offset=src_offset * elem_size,
+            byte_count=nbytes,
+        )
+    else:
+        tr = lmad_map_tr(dst_strides, src_strides, shape)
+        if tr is not None:
+            (k, n, m) = tr
+            map_transpose_gpu2gpu(self, elem_size, dst, dst_offset, src, src_offset, k, m, n)
+        else:
+            copy_elements_gpu2gpu(
+                self,
+                elem_size,
+                dst,
+                dst_offset,
+                dst_strides,
+                src,
+                src_offset,
+                src_strides,
+                shape,
+            )
+
+
+import pyopencl.array
+import time
+
+sizes = {}
+synchronous = False
+preferred_platform = None
+build_options = []
+preferred_device = None
+default_threshold = None
+default_group_size = None
+default_num_groups = None
+default_tile_size = None
+default_reg_tile_size = None
+fut_opencl_src = """#define FUTHARK_OPENCL
+// Start of prelude.cl
+
+#define SCALAR_FUN_ATTR static inline
+#define FUTHARK_FUN_ATTR static
+
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+typedef long int64_t;
+
+typedef uchar uint8_t;
+typedef ushort uint16_t;
+typedef uint uint32_t;
+typedef ulong uint64_t;
+
+#define get_tblock_id(d) get_group_id(d)
+#define get_num_tblocks(d) get_num_groups(d)
+
+// Clang-based OpenCL implementations need this for 'static' to work.
+#ifdef cl_clang_storage_class_specifiers
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+#ifdef FUTHARK_F64_ENABLED
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+// NVIDIA's OpenCL does not create device-wide memory fences (see #734), so we
+// use inline assembly if we detect we are on an NVIDIA GPU.
+#ifdef cl_nv_pragma_unroll
+static inline void mem_fence_global() {
+  asm("membar.gl;");
+}
+#else
+static inline void mem_fence_global() {
+  mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
+#endif
+static inline void mem_fence_local() {
+  mem_fence(CLK_LOCAL_MEM_FENCE);
+}
+
+static inline void barrier_local() {
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// Important for this to be int64_t so it has proper alignment for any type.
+#define SHARED_MEM_PARAM __local uint64_t* shared_mem,
+#define FUTHARK_KERNEL __kernel
+#define FUTHARK_KERNEL_SIZED(a,b,c) __attribute__((reqd_work_group_size(a, b, c))) __kernel
+
+// End of prelude.cl
+// Start of half.h.
+
+// Conversion functions are from http://half.sourceforge.net/, but
+// translated to C.
+//
+// Copyright (c) 2012-2021 Christian Rau
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
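+
+// The lookup tables below implement branch-free float<->half conversion:
+// base_table and shift_table map a binary32 bit pattern down to binary16,
+// while mantissa_table, exponent_table and offset_table handle the reverse.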
+ +#ifndef __OPENCL_VERSION__ +#define __constant +#endif + +__constant static const uint16_t base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, + 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, + 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, + 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, + 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 }; + +__constant static const unsigned char shift_table[512] = { + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 }; + +__constant static const uint32_t mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, + 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, + 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 
0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, + 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, + 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, + 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, + 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, + 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, + 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, + 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, + 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, + 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, + 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 
0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, + 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, + 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, + 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, + 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, + 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, + 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, + 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, + 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, + 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, + 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, + 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 
0x381F4000, 0x381F8000, 0x381FC000, + 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, + 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, + 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, + 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, + 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, + 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, + 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, + 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, + 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, + 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, + 0x38680000, 0x38684000, 
0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, + 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, + 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, + 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, + 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, + 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, + 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, + 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, + 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, + 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, + 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, + 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 
0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, + 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, + 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, + 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, + 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, + 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, + 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, + 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, + 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, + 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, + 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 
0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, + 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, + 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, + 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, + 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, + 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, + 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, + 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, + 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, + 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, + 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, + 0x38620000, 
0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, + 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, + 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, + 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, + 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, + 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, + 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, + 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, + 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 }; +__constant static const uint32_t exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, + 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, + 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 
0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 }; +__constant static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; + +SCALAR_FUN_ATTR uint16_t float2halfbits(float value) { + union { float x; uint32_t y; } u; + u.x = value; + uint32_t bits = u.y; + + uint16_t hbits = base_table[bits>>23] + (uint16_t)((bits&0x7FFFFF)>>shift_table[bits>>23]);; + + return hbits; +} + +SCALAR_FUN_ATTR float halfbits2float(uint16_t value) { + uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10]; + + union { uint32_t x; float y; } u; + u.x = bits; + return u.y; +} + +SCALAR_FUN_ATTR uint16_t halfbitsnextafter(uint16_t from, uint16_t to) { + int fabs = from & 0x7FFF, tabs = to & 0x7FFF; + if(fabs > 0x7C00 || tabs > 0x7C00) { + return ((from&0x7FFF)>0x7C00) ? (from|0x200) : (to|0x200); + } + if(from == to || !(fabs|tabs)) { + return to; + } + if(!fabs) { + return (to&0x8000)+1; + } + unsigned int out = + from + + (((from>>15)^(unsigned int)((from^(0x8000|(0x8000-(from>>15))))<(to^(0x8000|(0x8000-(to>>15))))))<<1) + - 1; + return out; +} + +// End of half.h. +// Start of scalar.h. + +// Implementation of the primitive scalar operations. Very +// repetitive. This code is inserted directly into both CUDA and +// OpenCL programs, as well as the CPU code, so it has some #ifdefs to +// work everywhere. Some operations are defined as macros because +// this allows us to use them as constant expressions in things like +// array sizes and static initialisers. + +// Some of the #ifdefs are because OpenCL uses type-generic functions +// for some operations (e.g. sqrt), while C and CUDA sensibly use +// distinct functions for different precisions (e.g. sqrtf() and +// sqrt()). This is quite annoying. Due to C's unfortunate casting +// rules, it is also really easy to accidentally implement +// floating-point functions in the wrong precision, so be careful. + +// Double-precision definitions are only included if the preprocessor +// macro FUTHARK_F64_ENABLED is set. 
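+//
+// Naming convention for the integer helpers that follow: a 'u' prefix is the
+// unsigned operation, 's' the signed one, with the bit width as the suffix.
+// The *_safe variants return 0 rather than trapping when the divisor is 0.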
+ +SCALAR_FUN_ATTR int32_t futrts_to_bits32(float x); +SCALAR_FUN_ATTR float futrts_from_bits32(int32_t x); + +SCALAR_FUN_ATTR uint8_t add8(uint8_t x, uint8_t y) { + return x + y; +} + +SCALAR_FUN_ATTR uint16_t add16(uint16_t x, uint16_t y) { + return x + y; +} + +SCALAR_FUN_ATTR uint32_t add32(uint32_t x, uint32_t y) { + return x + y; +} + +SCALAR_FUN_ATTR uint64_t add64(uint64_t x, uint64_t y) { + return x + y; +} + +SCALAR_FUN_ATTR uint8_t sub8(uint8_t x, uint8_t y) { + return x - y; +} + +SCALAR_FUN_ATTR uint16_t sub16(uint16_t x, uint16_t y) { + return x - y; +} + +SCALAR_FUN_ATTR uint32_t sub32(uint32_t x, uint32_t y) { + return x - y; +} + +SCALAR_FUN_ATTR uint64_t sub64(uint64_t x, uint64_t y) { + return x - y; +} + +SCALAR_FUN_ATTR uint8_t mul8(uint8_t x, uint8_t y) { + return x * y; +} + +SCALAR_FUN_ATTR uint16_t mul16(uint16_t x, uint16_t y) { + return x * y; +} + +SCALAR_FUN_ATTR uint32_t mul32(uint32_t x, uint32_t y) { + return x * y; +} + +SCALAR_FUN_ATTR uint64_t mul64(uint64_t x, uint64_t y) { + return x * y; +} + +#if ISPC + +SCALAR_FUN_ATTR uint8_t udiv8(uint8_t x, uint8_t y) { + // This strange pattern is used to prevent the ISPC compiler from + // causing SIGFPEs and bogus results on divisions where inactive lanes + // have 0-valued divisors. It ensures that any inactive lane instead + // has a divisor of 1. https://github.com/ispc/ispc/issues/2292 + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x / ys; +} + +SCALAR_FUN_ATTR uint16_t udiv16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x / ys; +} + +SCALAR_FUN_ATTR uint32_t udiv32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return x / ys; +} + +SCALAR_FUN_ATTR uint64_t udiv64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return x / ys; +} + +SCALAR_FUN_ATTR uint8_t udiv_up8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint16_t udiv_up16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint32_t udiv_up32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint64_t udiv_up64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint8_t umod8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x % ys; +} + +SCALAR_FUN_ATTR uint16_t umod16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return x % ys; +} + +SCALAR_FUN_ATTR uint32_t umod32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x % ys; +} + +SCALAR_FUN_ATTR uint64_t umod64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x % ys; +} + +SCALAR_FUN_ATTR uint8_t udiv_safe8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x / ys; +} + +SCALAR_FUN_ATTR uint16_t udiv_safe16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x / ys; +} + +SCALAR_FUN_ATTR uint32_t udiv_safe32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 
0 : x / ys; +} + +SCALAR_FUN_ATTR uint64_t udiv_safe64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x / ys; +} + +SCALAR_FUN_ATTR uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +SCALAR_FUN_ATTR uint8_t umod_safe8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +SCALAR_FUN_ATTR uint16_t umod_safe16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +SCALAR_FUN_ATTR uint32_t umod_safe32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +SCALAR_FUN_ATTR uint64_t umod_safe64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +SCALAR_FUN_ATTR int8_t sdiv8(int8_t x, int8_t y) { + int8_t ys = 1; + foreach_active(i){ + ys = y; + } + + int8_t q = x / ys; + int8_t r = x % ys; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int16_t sdiv16(int16_t x, int16_t y) { + int16_t ys = 1; + foreach_active(i){ + ys = y; + } + + int16_t q = x / ys; + int16_t r = x % ys; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int32_t sdiv32(int32_t x, int32_t y) { + int32_t ys = 1; + foreach_active(i){ + ys = y; + } + int32_t q = x / ys; + int32_t r = x % ys; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int64_t sdiv64(int64_t x, int64_t y) { + int64_t ys = 1; + foreach_active(i){ + ys = y; + } + + int64_t q = x / ys; + int64_t r = x % ys; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int8_t sdiv_up8(int8_t x, int8_t y) { + return sdiv8(x + y - 1, y); +} + +SCALAR_FUN_ATTR int16_t sdiv_up16(int16_t x, int16_t y) { + return sdiv16(x + y - 1, y); +} + +SCALAR_FUN_ATTR int32_t sdiv_up32(int32_t x, int32_t y) { + return sdiv32(x + y - 1, y); +} + +SCALAR_FUN_ATTR int64_t sdiv_up64(int64_t x, int64_t y) { + return sdiv64(x + y - 1, y); +} + +SCALAR_FUN_ATTR int8_t smod8(int8_t x, int8_t y) { + int8_t ys = 1; + foreach_active(i){ + ys = y; + } + + int8_t r = x % ys; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int16_t smod16(int16_t x, int16_t y) { + int16_t ys = 1; + foreach_active(i){ + ys = y; + } + + int16_t r = x % ys; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int32_t smod32(int32_t x, int32_t y) { + int32_t ys = 1; + foreach_active(i){ + ys = y; + } + + int32_t r = x % ys; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int64_t smod64(int64_t x, int64_t y) { + int64_t ys = 1; + foreach_active(i){ + ys = y; + } + + int64_t r = x % ys; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 
0 : y);
+}
+
+SCALAR_FUN_ATTR int8_t sdiv_safe8(int8_t x, int8_t y) {
+  return y == 0 ? 0 : sdiv8(x, y);
+}
+
+SCALAR_FUN_ATTR int16_t sdiv_safe16(int16_t x, int16_t y) {
+  return y == 0 ? 0 : sdiv16(x, y);
+}
+
+SCALAR_FUN_ATTR int32_t sdiv_safe32(int32_t x, int32_t y) {
+  return y == 0 ? 0 : sdiv32(x, y);
+}
+
+SCALAR_FUN_ATTR int64_t sdiv_safe64(int64_t x, int64_t y) {
+  return y == 0 ? 0 : sdiv64(x, y);
+}
+
+SCALAR_FUN_ATTR int8_t sdiv_up_safe8(int8_t x, int8_t y) {
+  return sdiv_safe8(x + y - 1, y);
+}
+
+SCALAR_FUN_ATTR int16_t sdiv_up_safe16(int16_t x, int16_t y) {
+  return sdiv_safe16(x + y - 1, y);
+}
+
+SCALAR_FUN_ATTR int32_t sdiv_up_safe32(int32_t x, int32_t y) {
+  return sdiv_safe32(x + y - 1, y);
+}
+
+SCALAR_FUN_ATTR int64_t sdiv_up_safe64(int64_t x, int64_t y) {
+  return sdiv_safe64(x + y - 1, y);
+}
+
+SCALAR_FUN_ATTR int8_t smod_safe8(int8_t x, int8_t y) {
+  return y == 0 ? 0 : smod8(x, y);
+}
+
+SCALAR_FUN_ATTR int16_t smod_safe16(int16_t x, int16_t y) {
+  return y == 0 ? 0 : smod16(x, y);
+}
+
+SCALAR_FUN_ATTR int32_t smod_safe32(int32_t x, int32_t y) {
+  return y == 0 ? 0 : smod32(x, y);
+}
+
+SCALAR_FUN_ATTR int64_t smod_safe64(int64_t x, int64_t y) {
+  return y == 0 ? 0 : smod64(x, y);
+}
+
+SCALAR_FUN_ATTR int8_t squot8(int8_t x, int8_t y) {
+  int8_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x / ys;
+}
+
+SCALAR_FUN_ATTR int16_t squot16(int16_t x, int16_t y) {
+  int16_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x / ys;
+}
+
+SCALAR_FUN_ATTR int32_t squot32(int32_t x, int32_t y) {
+  int32_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x / ys;
+}
+
+SCALAR_FUN_ATTR int64_t squot64(int64_t x, int64_t y) {
+  int64_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x / ys;
+}
+
+SCALAR_FUN_ATTR int8_t srem8(int8_t x, int8_t y) {
+  int8_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x % ys;
+}
+
+SCALAR_FUN_ATTR int16_t srem16(int16_t x, int16_t y) {
+  int16_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x % ys;
+}
+
+SCALAR_FUN_ATTR int32_t srem32(int32_t x, int32_t y) {
+  int32_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x % ys;
+}
+
+SCALAR_FUN_ATTR int64_t srem64(int64_t x, int64_t y) {
+  int64_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return x % ys;
+}
+
+SCALAR_FUN_ATTR int8_t squot_safe8(int8_t x, int8_t y) {
+  int8_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x / ys;
+}
+
+SCALAR_FUN_ATTR int16_t squot_safe16(int16_t x, int16_t y) {
+  int16_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x / ys;
+}
+
+SCALAR_FUN_ATTR int32_t squot_safe32(int32_t x, int32_t y) {
+  int32_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x / ys;
+}
+
+SCALAR_FUN_ATTR int64_t squot_safe64(int64_t x, int64_t y) {
+  int64_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x / ys;
+}
+
+SCALAR_FUN_ATTR int8_t srem_safe8(int8_t x, int8_t y) {
+  int8_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x % ys;
+}
+
+SCALAR_FUN_ATTR int16_t srem_safe16(int16_t x, int16_t y) {
+  int16_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x % ys;
+}
+
+SCALAR_FUN_ATTR int32_t srem_safe32(int32_t x, int32_t y) {
+  int32_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 0 : x % ys;
+}
+
+SCALAR_FUN_ATTR int64_t srem_safe64(int64_t x, int64_t y) {
+  int64_t ys = 1;
+  foreach_active(i){
+    ys = y;
+  }
+
+  return y == 0 ? 
0 : x % ys; +} + +#else + +SCALAR_FUN_ATTR uint8_t udiv8(uint8_t x, uint8_t y) { + return x / y; +} + +SCALAR_FUN_ATTR uint16_t udiv16(uint16_t x, uint16_t y) { + return x / y; +} + +SCALAR_FUN_ATTR uint32_t udiv32(uint32_t x, uint32_t y) { + return x / y; +} + +SCALAR_FUN_ATTR uint64_t udiv64(uint64_t x, uint64_t y) { + return x / y; +} + +SCALAR_FUN_ATTR uint8_t udiv_up8(uint8_t x, uint8_t y) { + return (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint16_t udiv_up16(uint16_t x, uint16_t y) { + return (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint32_t udiv_up32(uint32_t x, uint32_t y) { + return (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint64_t udiv_up64(uint64_t x, uint64_t y) { + return (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint8_t umod8(uint8_t x, uint8_t y) { + return x % y; +} + +SCALAR_FUN_ATTR uint16_t umod16(uint16_t x, uint16_t y) { + return x % y; +} + +SCALAR_FUN_ATTR uint32_t umod32(uint32_t x, uint32_t y) { + return x % y; +} + +SCALAR_FUN_ATTR uint64_t umod64(uint64_t x, uint64_t y) { + return x % y; +} + +SCALAR_FUN_ATTR uint8_t udiv_safe8(uint8_t x, uint8_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR uint16_t udiv_safe16(uint16_t x, uint16_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR uint32_t udiv_safe32(uint32_t x, uint32_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR uint64_t udiv_safe64(uint64_t x, uint64_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { + return y == 0 ? 0 : (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { + return y == 0 ? 0 : (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { + return y == 0 ? 0 : (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { + return y == 0 ? 0 : (x + y - 1) / y; +} + +SCALAR_FUN_ATTR uint8_t umod_safe8(uint8_t x, uint8_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR uint16_t umod_safe16(uint16_t x, uint16_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR uint32_t umod_safe32(uint32_t x, uint32_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR uint64_t umod_safe64(uint64_t x, uint64_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR int8_t sdiv8(int8_t x, int8_t y) { + int8_t q = x / y; + int8_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int16_t sdiv16(int16_t x, int16_t y) { + int16_t q = x / y; + int16_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int32_t sdiv32(int32_t x, int32_t y) { + int32_t q = x / y; + int32_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int64_t sdiv64(int64_t x, int64_t y) { + int64_t q = x / y; + int64_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} + +SCALAR_FUN_ATTR int8_t sdiv_up8(int8_t x, int8_t y) { + return sdiv8(x + y - 1, y); +} + +SCALAR_FUN_ATTR int16_t sdiv_up16(int16_t x, int16_t y) { + return sdiv16(x + y - 1, y); +} + +SCALAR_FUN_ATTR int32_t sdiv_up32(int32_t x, int32_t y) { + return sdiv32(x + y - 1, y); +} + +SCALAR_FUN_ATTR int64_t sdiv_up64(int64_t x, int64_t y) { + return sdiv64(x + y - 1, y); +} + +SCALAR_FUN_ATTR int8_t smod8(int8_t x, int8_t y) { + int8_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int16_t smod16(int16_t x, int16_t y) { + int16_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 
0 : y); +} + +SCALAR_FUN_ATTR int32_t smod32(int32_t x, int32_t y) { + int32_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int64_t smod64(int64_t x, int64_t y) { + int64_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} + +SCALAR_FUN_ATTR int8_t sdiv_safe8(int8_t x, int8_t y) { + return y == 0 ? 0 : sdiv8(x, y); +} + +SCALAR_FUN_ATTR int16_t sdiv_safe16(int16_t x, int16_t y) { + return y == 0 ? 0 : sdiv16(x, y); +} + +SCALAR_FUN_ATTR int32_t sdiv_safe32(int32_t x, int32_t y) { + return y == 0 ? 0 : sdiv32(x, y); +} + +SCALAR_FUN_ATTR int64_t sdiv_safe64(int64_t x, int64_t y) { + return y == 0 ? 0 : sdiv64(x, y); +} + +SCALAR_FUN_ATTR int8_t sdiv_up_safe8(int8_t x, int8_t y) { + return sdiv_safe8(x + y - 1, y); +} + +SCALAR_FUN_ATTR int16_t sdiv_up_safe16(int16_t x, int16_t y) { + return sdiv_safe16(x + y - 1, y); +} + +SCALAR_FUN_ATTR int32_t sdiv_up_safe32(int32_t x, int32_t y) { + return sdiv_safe32(x + y - 1, y); +} + +SCALAR_FUN_ATTR int64_t sdiv_up_safe64(int64_t x, int64_t y) { + return sdiv_safe64(x + y - 1, y); +} + +SCALAR_FUN_ATTR int8_t smod_safe8(int8_t x, int8_t y) { + return y == 0 ? 0 : smod8(x, y); +} + +SCALAR_FUN_ATTR int16_t smod_safe16(int16_t x, int16_t y) { + return y == 0 ? 0 : smod16(x, y); +} + +SCALAR_FUN_ATTR int32_t smod_safe32(int32_t x, int32_t y) { + return y == 0 ? 0 : smod32(x, y); +} + +SCALAR_FUN_ATTR int64_t smod_safe64(int64_t x, int64_t y) { + return y == 0 ? 0 : smod64(x, y); +} + +SCALAR_FUN_ATTR int8_t squot8(int8_t x, int8_t y) { + return x / y; +} + +SCALAR_FUN_ATTR int16_t squot16(int16_t x, int16_t y) { + return x / y; +} + +SCALAR_FUN_ATTR int32_t squot32(int32_t x, int32_t y) { + return x / y; +} + +SCALAR_FUN_ATTR int64_t squot64(int64_t x, int64_t y) { + return x / y; +} + +SCALAR_FUN_ATTR int8_t srem8(int8_t x, int8_t y) { + return x % y; +} + +SCALAR_FUN_ATTR int16_t srem16(int16_t x, int16_t y) { + return x % y; +} + +SCALAR_FUN_ATTR int32_t srem32(int32_t x, int32_t y) { + return x % y; +} + +SCALAR_FUN_ATTR int64_t srem64(int64_t x, int64_t y) { + return x % y; +} + +SCALAR_FUN_ATTR int8_t squot_safe8(int8_t x, int8_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR int16_t squot_safe16(int16_t x, int16_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR int32_t squot_safe32(int32_t x, int32_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR int64_t squot_safe64(int64_t x, int64_t y) { + return y == 0 ? 0 : x / y; +} + +SCALAR_FUN_ATTR int8_t srem_safe8(int8_t x, int8_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR int16_t srem_safe16(int16_t x, int16_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR int32_t srem_safe32(int32_t x, int32_t y) { + return y == 0 ? 0 : x % y; +} + +SCALAR_FUN_ATTR int64_t srem_safe64(int64_t x, int64_t y) { + return y == 0 ? 0 : x % y; +} + +#endif + +SCALAR_FUN_ATTR int8_t smin8(int8_t x, int8_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR int16_t smin16(int16_t x, int16_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR int32_t smin32(int32_t x, int32_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR int64_t smin64(int64_t x, int64_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR uint8_t umin8(uint8_t x, uint8_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR uint16_t umin16(uint16_t x, uint16_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR uint32_t umin32(uint32_t x, uint32_t y) { + return x < y ? 
x : y; +} + +SCALAR_FUN_ATTR uint64_t umin64(uint64_t x, uint64_t y) { + return x < y ? x : y; +} + +SCALAR_FUN_ATTR int8_t smax8(int8_t x, int8_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR int16_t smax16(int16_t x, int16_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR int32_t smax32(int32_t x, int32_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR int64_t smax64(int64_t x, int64_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint8_t umax8(uint8_t x, uint8_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint16_t umax16(uint16_t x, uint16_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint32_t umax32(uint32_t x, uint32_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint64_t umax64(uint64_t x, uint64_t y) { + return x < y ? y : x; +} + +SCALAR_FUN_ATTR uint8_t shl8(uint8_t x, uint8_t y) { + return (uint8_t)(x << y); +} + +SCALAR_FUN_ATTR uint16_t shl16(uint16_t x, uint16_t y) { + return (uint16_t)(x << y); +} + +SCALAR_FUN_ATTR uint32_t shl32(uint32_t x, uint32_t y) { + return x << y; +} + +SCALAR_FUN_ATTR uint64_t shl64(uint64_t x, uint64_t y) { + return x << y; +} + +SCALAR_FUN_ATTR uint8_t lshr8(uint8_t x, uint8_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR uint16_t lshr16(uint16_t x, uint16_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR uint32_t lshr32(uint32_t x, uint32_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR uint64_t lshr64(uint64_t x, uint64_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR int8_t ashr8(int8_t x, int8_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR int16_t ashr16(int16_t x, int16_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR int32_t ashr32(int32_t x, int32_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR int64_t ashr64(int64_t x, int64_t y) { + return x >> y; +} + +SCALAR_FUN_ATTR uint8_t and8(uint8_t x, uint8_t y) { + return x & y; +} + +SCALAR_FUN_ATTR uint16_t and16(uint16_t x, uint16_t y) { + return x & y; +} + +SCALAR_FUN_ATTR uint32_t and32(uint32_t x, uint32_t y) { + return x & y; +} + +SCALAR_FUN_ATTR uint64_t and64(uint64_t x, uint64_t y) { + return x & y; +} + +SCALAR_FUN_ATTR uint8_t or8(uint8_t x, uint8_t y) { + return x | y; +} + +SCALAR_FUN_ATTR uint16_t or16(uint16_t x, uint16_t y) { + return x | y; +} + +SCALAR_FUN_ATTR uint32_t or32(uint32_t x, uint32_t y) { + return x | y; +} + +SCALAR_FUN_ATTR uint64_t or64(uint64_t x, uint64_t y) { + return x | y; +} + +SCALAR_FUN_ATTR uint8_t xor8(uint8_t x, uint8_t y) { + return x ^ y; +} + +SCALAR_FUN_ATTR uint16_t xor16(uint16_t x, uint16_t y) { + return x ^ y; +} + +SCALAR_FUN_ATTR uint32_t xor32(uint32_t x, uint32_t y) { + return x ^ y; +} + +SCALAR_FUN_ATTR uint64_t xor64(uint64_t x, uint64_t y) { + return x ^ y; +} + +SCALAR_FUN_ATTR bool ult8(uint8_t x, uint8_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool ult16(uint16_t x, uint16_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool ult32(uint32_t x, uint32_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool ult64(uint64_t x, uint64_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool ule8(uint8_t x, uint8_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool ule16(uint16_t x, uint16_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool ule32(uint32_t x, uint32_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool ule64(uint64_t x, uint64_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool slt8(int8_t x, int8_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool slt16(int16_t x, int16_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool slt32(int32_t x, int32_t y) { + return x < y; +} + +SCALAR_FUN_ATTR bool slt64(int64_t x, int64_t y) { + 
return x < y; +} + +SCALAR_FUN_ATTR bool sle8(int8_t x, int8_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool sle16(int16_t x, int16_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool sle32(int32_t x, int32_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR bool sle64(int64_t x, int64_t y) { + return x <= y; +} + +SCALAR_FUN_ATTR uint8_t pow8(uint8_t x, uint8_t y) { + uint8_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} + +SCALAR_FUN_ATTR uint16_t pow16(uint16_t x, uint16_t y) { + uint16_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} + +SCALAR_FUN_ATTR uint32_t pow32(uint32_t x, uint32_t y) { + uint32_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} + +SCALAR_FUN_ATTR uint64_t pow64(uint64_t x, uint64_t y) { + uint64_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} + +SCALAR_FUN_ATTR bool itob_i8_bool(int8_t x) { + return x != 0; +} + +SCALAR_FUN_ATTR bool itob_i16_bool(int16_t x) { + return x != 0; +} + +SCALAR_FUN_ATTR bool itob_i32_bool(int32_t x) { + return x != 0; +} + +SCALAR_FUN_ATTR bool itob_i64_bool(int64_t x) { + return x != 0; +} + +SCALAR_FUN_ATTR int8_t btoi_bool_i8(bool x) { + return x; +} + +SCALAR_FUN_ATTR int16_t btoi_bool_i16(bool x) { + return x; +} + +SCALAR_FUN_ATTR int32_t btoi_bool_i32(bool x) { + return x; +} + +SCALAR_FUN_ATTR int64_t btoi_bool_i64(bool x) { + return x; +} + +#define sext_i8_i8(x) ((int8_t) (int8_t) (x)) +#define sext_i8_i16(x) ((int16_t) (int8_t) (x)) +#define sext_i8_i32(x) ((int32_t) (int8_t) (x)) +#define sext_i8_i64(x) ((int64_t) (int8_t) (x)) +#define sext_i16_i8(x) ((int8_t) (int16_t) (x)) +#define sext_i16_i16(x) ((int16_t) (int16_t) (x)) +#define sext_i16_i32(x) ((int32_t) (int16_t) (x)) +#define sext_i16_i64(x) ((int64_t) (int16_t) (x)) +#define sext_i32_i8(x) ((int8_t) (int32_t) (x)) +#define sext_i32_i16(x) ((int16_t) (int32_t) (x)) +#define sext_i32_i32(x) ((int32_t) (int32_t) (x)) +#define sext_i32_i64(x) ((int64_t) (int32_t) (x)) +#define sext_i64_i8(x) ((int8_t) (int64_t) (x)) +#define sext_i64_i16(x) ((int16_t) (int64_t) (x)) +#define sext_i64_i32(x) ((int32_t) (int64_t) (x)) +#define sext_i64_i64(x) ((int64_t) (int64_t) (x)) +#define zext_i8_i8(x) ((int8_t) (uint8_t) (x)) +#define zext_i8_i16(x) ((int16_t) (uint8_t) (x)) +#define zext_i8_i32(x) ((int32_t) (uint8_t) (x)) +#define zext_i8_i64(x) ((int64_t) (uint8_t) (x)) +#define zext_i16_i8(x) ((int8_t) (uint16_t) (x)) +#define zext_i16_i16(x) ((int16_t) (uint16_t) (x)) +#define zext_i16_i32(x) ((int32_t) (uint16_t) (x)) +#define zext_i16_i64(x) ((int64_t) (uint16_t) (x)) +#define zext_i32_i8(x) ((int8_t) (uint32_t) (x)) +#define zext_i32_i16(x) ((int16_t) (uint32_t) (x)) +#define zext_i32_i32(x) ((int32_t) (uint32_t) (x)) +#define zext_i32_i64(x) ((int64_t) (uint32_t) (x)) +#define zext_i64_i8(x) ((int8_t) (uint64_t) (x)) +#define zext_i64_i16(x) ((int16_t) (uint64_t) (x)) +#define zext_i64_i32(x) ((int32_t) (uint64_t) (x)) +#define zext_i64_i64(x) ((int64_t) (uint64_t) (x)) + +SCALAR_FUN_ATTR int8_t abs8(int8_t x) { + return (int8_t)abs(x); +} + +SCALAR_FUN_ATTR int16_t abs16(int16_t x) { + return (int16_t)abs(x); +} + +SCALAR_FUN_ATTR int32_t abs32(int32_t x) { + return abs(x); +} + +SCALAR_FUN_ATTR int64_t abs64(int64_t x) { +#if defined(__OPENCL_VERSION__) || defined(ISPC) + return abs(x); +#else + return llabs(x); 
+#endif +} + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR int32_t futrts_popc8(int8_t x) { + return popcount(x); +} + +SCALAR_FUN_ATTR int32_t futrts_popc16(int16_t x) { + return popcount(x); +} + +SCALAR_FUN_ATTR int32_t futrts_popc32(int32_t x) { + return popcount(x); +} + +SCALAR_FUN_ATTR int32_t futrts_popc64(int64_t x) { + return popcount(x); +} +#elif defined(__CUDA_ARCH__) + +SCALAR_FUN_ATTR int32_t futrts_popc8(int8_t x) { + return __popc(zext_i8_i32(x)); +} + +SCALAR_FUN_ATTR int32_t futrts_popc16(int16_t x) { + return __popc(zext_i16_i32(x)); +} + +SCALAR_FUN_ATTR int32_t futrts_popc32(int32_t x) { + return __popc(x); +} + +SCALAR_FUN_ATTR int32_t futrts_popc64(int64_t x) { + return __popcll(x); +} + +#else // Not OpenCL or CUDA, but plain C. + +SCALAR_FUN_ATTR int32_t futrts_popc8(uint8_t x) { + int c = 0; + for (; x; ++c) { x &= x - 1; } + return c; +} + +SCALAR_FUN_ATTR int32_t futrts_popc16(uint16_t x) { + int c = 0; + for (; x; ++c) { x &= x - 1; } + return c; +} + +SCALAR_FUN_ATTR int32_t futrts_popc32(uint32_t x) { + int c = 0; + for (; x; ++c) { x &= x - 1; } + return c; +} + +SCALAR_FUN_ATTR int32_t futrts_popc64(uint64_t x) { + int c = 0; + for (; x; ++c) { x &= x - 1; } + return c; +} +#endif + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8 ( uint8_t a, uint8_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); } +#elif defined(__CUDA_ARCH__) +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); } +SCALAR_FUN_ATTR uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); } +#elif ISPC +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { + uint64_t ah = a >> 32; + uint64_t al = a & 0xffffffff; + uint64_t bh = b >> 32; + uint64_t bl = b & 0xffffffff; + + uint64_t p1 = al * bl; + uint64_t p2 = al * bh; + uint64_t p3 = ah * bl; + uint64_t p4 = ah * bh; + + uint64_t p1h = p1 >> 32; + uint64_t p2h = p2 >> 32; + uint64_t p3h = 
p3 >> 32; + uint64_t p2l = p2 & 0xffffffff; + uint64_t p3l = p3 & 0xffffffff; + + uint64_t l = p1h + p2l + p3l; + uint64_t m = (p2 >> 32) + (p3 >> 32); + uint64_t h = (l >> 32) + m + p4; + + return h; +} +SCALAR_FUN_ATTR int8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } +SCALAR_FUN_ATTR int64_t futrts_smul_hi64(int64_t a, int64_t b) { + uint64_t ah = a >> 32; + uint64_t al = a & 0xffffffff; + uint64_t bh = b >> 32; + uint64_t bl = b & 0xffffffff; + + uint64_t p1 = al * bl; + int64_t p2 = al * bh; + int64_t p3 = ah * bl; + uint64_t p4 = ah * bh; + + uint64_t p1h = p1 >> 32; + uint64_t p2h = p2 >> 32; + uint64_t p3h = p3 >> 32; + uint64_t p2l = p2 & 0xffffffff; + uint64_t p3l = p3 & 0xffffffff; + + uint64_t l = p1h + p2l + p3l; + uint64_t m = (p2 >> 32) + (p3 >> 32); + uint64_t h = (l >> 32) + m + p4; + + return h; +} + +#else // Not OpenCL, ISPC, or CUDA, but plain C. +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; } +SCALAR_FUN_ATTR int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; } +SCALAR_FUN_ATTR int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } +SCALAR_FUN_ATTR int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; } +SCALAR_FUN_ATTR int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; } +#endif + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR uint8_t futrts_umad_hi8 ( uint8_t a, uint8_t b, uint8_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint8_t futrts_smad_hi8( int8_t a, int8_t b, int8_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); } +#else // Not OpenCL + +SCALAR_FUN_ATTR uint8_t futrts_umad_hi8( uint8_t a, uint8_t b, uint8_t c) { return futrts_umul_hi8(a, b) + c; } +SCALAR_FUN_ATTR uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; } +SCALAR_FUN_ATTR uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; } +SCALAR_FUN_ATTR uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; } +SCALAR_FUN_ATTR uint8_t futrts_smad_hi8 ( int8_t a, int8_t b, int8_t c) { return futrts_smul_hi8(a, b) + c; } +SCALAR_FUN_ATTR uint16_t 
futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return futrts_smul_hi16(a, b) + c; } +SCALAR_FUN_ATTR uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; } +SCALAR_FUN_ATTR uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; } +#endif + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { + return clz(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { + return clz(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { + return clz(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { + return clz(x); +} + +#elif defined(__CUDA_ARCH__) + +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { + return __clz(zext_i8_i32(x)) - 24; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { + return __clz(zext_i16_i32(x)) - 16; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { + return __clz(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { + return __clzll(x); +} + +#elif ISPC + +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { + return count_leading_zeros((int32_t)(uint8_t)x)-24; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { + return count_leading_zeros((int32_t)(uint16_t)x)-16; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { + return count_leading_zeros(x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { + return count_leading_zeros(x); +} + +#else // Not OpenCL, ISPC or CUDA, but plain C. + +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { + return x == 0 ? 8 : __builtin_clz((uint32_t)zext_i8_i32(x)) - 24; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { + return x == 0 ? 16 : __builtin_clz((uint32_t)zext_i16_i32(x)) - 16; +} + +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { + return x == 0 ? 32 : __builtin_clz((uint32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { + return x == 0 ? 64 : __builtin_clzll((uint64_t)x); +} +#endif + +#if defined(__OPENCL_VERSION__) +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { + int i = 0; + for (; i < 8 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { + int i = 0; + for (; i < 16 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { + int i = 0; + for (; i < 32 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { + int i = 0; + for (; i < 64 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} + +#elif defined(__CUDA_ARCH__) + +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { + int y = __ffs(x); + return y == 0 ? 8 : y - 1; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { + int y = __ffs(x); + return y == 0 ? 16 : y - 1; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { + int y = __ffs(x); + return y == 0 ? 32 : y - 1; +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { + int y = __ffsll(x); + return y == 0 ? 64 : y - 1; +} + +#elif ISPC + +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { + return x == 0 ? 8 : count_trailing_zeros((int32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { + return x == 0 ? 16 : count_trailing_zeros((int32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { + return count_trailing_zeros(x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { + return count_trailing_zeros(x); +} + +#else // Not OpenCL or CUDA, but plain C. + +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { + return x == 0 ? 
8 : __builtin_ctz((uint32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { + return x == 0 ? 16 : __builtin_ctz((uint32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { + return x == 0 ? 32 : __builtin_ctz((uint32_t)x); +} + +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { + return x == 0 ? 64 : __builtin_ctzll((uint64_t)x); +} +#endif + +SCALAR_FUN_ATTR float fdiv32(float x, float y) { + return x / y; +} + +SCALAR_FUN_ATTR float fadd32(float x, float y) { + return x + y; +} + +SCALAR_FUN_ATTR float fsub32(float x, float y) { + return x - y; +} + +SCALAR_FUN_ATTR float fmul32(float x, float y) { + return x * y; +} + +SCALAR_FUN_ATTR bool cmplt32(float x, float y) { + return x < y; +} + +SCALAR_FUN_ATTR bool cmple32(float x, float y) { + return x <= y; +} + +SCALAR_FUN_ATTR float sitofp_i8_f32(int8_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float sitofp_i16_f32(int16_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float sitofp_i32_f32(int32_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float sitofp_i64_f32(int64_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float uitofp_i8_f32(uint8_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float uitofp_i16_f32(uint16_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float uitofp_i32_f32(uint32_t x) { + return (float) x; +} + +SCALAR_FUN_ATTR float uitofp_i64_f32(uint64_t x) { + return (float) x; +} + +#ifdef __OPENCL_VERSION__ +SCALAR_FUN_ATTR float fabs32(float x) { + return fabs(x); +} + +SCALAR_FUN_ATTR float fmax32(float x, float y) { + return fmax(x, y); +} + +SCALAR_FUN_ATTR float fmin32(float x, float y) { + return fmin(x, y); +} + +SCALAR_FUN_ATTR float fpow32(float x, float y) { + return pow(x, y); +} + +#elif ISPC + +SCALAR_FUN_ATTR float fabs32(float x) { + return abs(x); +} + +SCALAR_FUN_ATTR float fmax32(float x, float y) { + return isnan(x) ? y : isnan(y) ? x : max(x, y); +} + +SCALAR_FUN_ATTR float fmin32(float x, float y) { + return isnan(x) ? y : isnan(y) ? x : min(x, y); +} + +SCALAR_FUN_ATTR float fpow32(float a, float b) { + float ret; + foreach_active (i) { + uniform float r = __stdlib_powf(extract(a, i), extract(b, i)); + ret = insert(ret, i, r); + } + return ret; +} + +#else // Not OpenCL, but CUDA or plain C. 
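+
+// Note: like the ISPC branch above, the C99 fmaxf/fminf used in this
+// branch treat a NaN operand as missing data rather than propagating
+// it: e.g. fmax32(NAN, 2.0f) == 2.0f and fmin32(2.0f, NAN) == 2.0f,
+// while fmax32(NAN, NAN) is still NaN.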
+ +SCALAR_FUN_ATTR float fabs32(float x) { + return fabsf(x); +} + +SCALAR_FUN_ATTR float fmax32(float x, float y) { + return fmaxf(x, y); +} + +SCALAR_FUN_ATTR float fmin32(float x, float y) { + return fminf(x, y); +} + +SCALAR_FUN_ATTR float fpow32(float x, float y) { + return powf(x, y); +} +#endif + +SCALAR_FUN_ATTR bool futrts_isnan32(float x) { + return isnan(x); +} + +#if ISPC + +SCALAR_FUN_ATTR bool futrts_isinf32(float x) { + return !isnan(x) && isnan(x - x); +} + +SCALAR_FUN_ATTR bool futrts_isfinite32(float x) { + return !isnan(x) && !futrts_isinf32(x); +} + +#else + +SCALAR_FUN_ATTR bool futrts_isinf32(float x) { + return isinf(x); +} + +#endif + +SCALAR_FUN_ATTR int8_t fptosi_f32_i8(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (int8_t) x; + } +} + +SCALAR_FUN_ATTR int16_t fptosi_f32_i16(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (int16_t) x; + } +} + +SCALAR_FUN_ATTR int32_t fptosi_f32_i32(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (int32_t) x; + } +} + +SCALAR_FUN_ATTR int64_t fptosi_f32_i64(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (int64_t) x; + }; +} + +SCALAR_FUN_ATTR uint8_t fptoui_f32_i8(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (uint8_t) (int8_t) x; + } +} + +SCALAR_FUN_ATTR uint16_t fptoui_f32_i16(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (uint16_t) (int16_t) x; + } +} + +SCALAR_FUN_ATTR uint32_t fptoui_f32_i32(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (uint32_t) (int32_t) x; + } +} + +SCALAR_FUN_ATTR uint64_t fptoui_f32_i64(float x) { + if (futrts_isnan32(x) || futrts_isinf32(x)) { + return 0; + } else { + return (uint64_t) (int64_t) x; + } +} + +SCALAR_FUN_ATTR bool ftob_f32_bool(float x) { + return x != 0; +} + +SCALAR_FUN_ATTR float btof_bool_f32(bool x) { + return x ? 
1 : 0; +} + +#ifdef __OPENCL_VERSION__ +SCALAR_FUN_ATTR float futrts_log32(float x) { + return log(x); +} + +SCALAR_FUN_ATTR float futrts_log2_32(float x) { + return log2(x); +} + +SCALAR_FUN_ATTR float futrts_log10_32(float x) { + return log10(x); +} + +SCALAR_FUN_ATTR float futrts_log1p_32(float x) { + return log1p(x); +} + +SCALAR_FUN_ATTR float futrts_sqrt32(float x) { + return sqrt(x); +} + +SCALAR_FUN_ATTR float futrts_cbrt32(float x) { + return cbrt(x); +} + +SCALAR_FUN_ATTR float futrts_exp32(float x) { + return exp(x); +} + +SCALAR_FUN_ATTR float futrts_cos32(float x) { + return cos(x); +} + +SCALAR_FUN_ATTR float futrts_sin32(float x) { + return sin(x); +} + +SCALAR_FUN_ATTR float futrts_tan32(float x) { + return tan(x); +} + +SCALAR_FUN_ATTR float futrts_acos32(float x) { + return acos(x); +} + +SCALAR_FUN_ATTR float futrts_asin32(float x) { + return asin(x); +} + +SCALAR_FUN_ATTR float futrts_atan32(float x) { + return atan(x); +} + +SCALAR_FUN_ATTR float futrts_cosh32(float x) { + return cosh(x); +} + +SCALAR_FUN_ATTR float futrts_sinh32(float x) { + return sinh(x); +} + +SCALAR_FUN_ATTR float futrts_tanh32(float x) { + return tanh(x); +} + +SCALAR_FUN_ATTR float futrts_acosh32(float x) { + return acosh(x); +} + +SCALAR_FUN_ATTR float futrts_asinh32(float x) { + return asinh(x); +} + +SCALAR_FUN_ATTR float futrts_atanh32(float x) { + return atanh(x); +} + +SCALAR_FUN_ATTR float futrts_atan2_32(float x, float y) { + return atan2(x, y); +} + +SCALAR_FUN_ATTR float futrts_hypot32(float x, float y) { + return hypot(x, y); +} + +SCALAR_FUN_ATTR float futrts_gamma32(float x) { + return tgamma(x); +} + +SCALAR_FUN_ATTR float futrts_lgamma32(float x) { + return lgamma(x); +} + +SCALAR_FUN_ATTR float futrts_erf32(float x) { + return erf(x); +} + +SCALAR_FUN_ATTR float futrts_erfc32(float x) { + return erfc(x); +} + +SCALAR_FUN_ATTR float fmod32(float x, float y) { + return fmod(x, y); +} + +SCALAR_FUN_ATTR float futrts_round32(float x) { + return rint(x); +} + +SCALAR_FUN_ATTR float futrts_floor32(float x) { + return floor(x); +} + +SCALAR_FUN_ATTR float futrts_ceil32(float x) { + return ceil(x); +} + +SCALAR_FUN_ATTR float futrts_nextafter32(float x, float y) { + return nextafter(x, y); +} + +SCALAR_FUN_ATTR float futrts_lerp32(float v0, float v1, float t) { + return mix(v0, v1, t); +} + +SCALAR_FUN_ATTR float futrts_ldexp32(float x, int32_t y) { + return ldexp(x, y); +} + +SCALAR_FUN_ATTR float futrts_copysign32(float x, float y) { + return copysign(x, y); +} + +SCALAR_FUN_ATTR float futrts_mad32(float a, float b, float c) { + return mad(a, b, c); +} + +SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) { + return fma(a, b, c); +} + +#elif ISPC + +SCALAR_FUN_ATTR float futrts_log32(float x) { + return futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)? 
log(x) : x; +} + +SCALAR_FUN_ATTR float futrts_log2_32(float x) { + return futrts_log32(x) / log(2.0f); +} + +SCALAR_FUN_ATTR float futrts_log10_32(float x) { + return futrts_log32(x) / log(10.0f); +} + +SCALAR_FUN_ATTR float futrts_log1p_32(float x) { + if(x == -1.0f || (futrts_isinf32(x) && x > 0.0f)) return x / 0.0f; + float y = 1.0f + x; + float z = y - 1.0f; + return log(y) - (z-x)/y; +} + +SCALAR_FUN_ATTR float futrts_sqrt32(float x) { + return sqrt(x); +} + +extern "C" unmasked uniform float cbrtf(uniform float); +SCALAR_FUN_ATTR float futrts_cbrt32(float x) { + float res; + foreach_active (i) { + uniform float r = cbrtf(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +SCALAR_FUN_ATTR float futrts_exp32(float x) { + return exp(x); +} + +SCALAR_FUN_ATTR float futrts_cos32(float x) { + return cos(x); +} + +SCALAR_FUN_ATTR float futrts_sin32(float x) { + return sin(x); +} + +SCALAR_FUN_ATTR float futrts_tan32(float x) { + return tan(x); +} + +SCALAR_FUN_ATTR float futrts_acos32(float x) { + return acos(x); +} + +SCALAR_FUN_ATTR float futrts_asin32(float x) { + return asin(x); +} + +SCALAR_FUN_ATTR float futrts_atan32(float x) { + return atan(x); +} + +SCALAR_FUN_ATTR float futrts_cosh32(float x) { + return (exp(x)+exp(-x)) / 2.0f; +} + +SCALAR_FUN_ATTR float futrts_sinh32(float x) { + return (exp(x)-exp(-x)) / 2.0f; +} + +SCALAR_FUN_ATTR float futrts_tanh32(float x) { + return futrts_sinh32(x)/futrts_cosh32(x); +} + +SCALAR_FUN_ATTR float futrts_acosh32(float x) { + float f = x+sqrt(x*x-1); + if(futrts_isfinite32(f)) return log(f); + return f; +} + +SCALAR_FUN_ATTR float futrts_asinh32(float x) { + float f = x+sqrt(x*x+1); + if(futrts_isfinite32(f)) return log(f); + return f; + +} + +SCALAR_FUN_ATTR float futrts_atanh32(float x) { + float f = (1+x)/(1-x); + if(futrts_isfinite32(f)) return log(f)/2.0f; + return f; + +} + +SCALAR_FUN_ATTR float futrts_atan2_32(float x, float y) { + return (x == 0.0f && y == 0.0f) ? 
0.0f : atan2(x, y); +} + +SCALAR_FUN_ATTR float futrts_hypot32(float x, float y) { + if (futrts_isfinite32(x) && futrts_isfinite32(y)) { + x = abs(x); + y = abs(y); + float a; + float b; + if (x >= y){ + a = x; + b = y; + } else { + a = y; + b = x; + } + if(b == 0){ + return a; + } + + int e; + float an; + float bn; + an = frexp (a, &e); + bn = ldexp (b, - e); + float cn; + cn = sqrt (an * an + bn * bn); + return ldexp (cn, e); + } else { + if (futrts_isinf32(x) || futrts_isinf32(y)) return INFINITY; + else return x + y; + } + +} + +extern "C" unmasked uniform float tgammaf(uniform float x); +SCALAR_FUN_ATTR float futrts_gamma32(float x) { + float res; + foreach_active (i) { + uniform float r = tgammaf(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +extern "C" unmasked uniform float lgammaf(uniform float x); +SCALAR_FUN_ATTR float futrts_lgamma32(float x) { + float res; + foreach_active (i) { + uniform float r = lgammaf(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +extern "C" unmasked uniform float erff(uniform float x); +SCALAR_FUN_ATTR float futrts_erf32(float x) { + float res; + foreach_active (i) { + uniform float r = erff(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +extern "C" unmasked uniform float erfcf(uniform float x); +SCALAR_FUN_ATTR float futrts_erfc32(float x) { + float res; + foreach_active (i) { + uniform float r = erfcf(extract(x, i)); + res = insert(res, i, r); + } + return res; +} + +SCALAR_FUN_ATTR float fmod32(float x, float y) { + return x - y * trunc(x/y); +} + +SCALAR_FUN_ATTR float futrts_round32(float x) { + return round(x); +} + +SCALAR_FUN_ATTR float futrts_floor32(float x) { + return floor(x); +} + +SCALAR_FUN_ATTR float futrts_ceil32(float x) { + return ceil(x); +} + +extern "C" unmasked uniform float nextafterf(uniform float x, uniform float y); +SCALAR_FUN_ATTR float futrts_nextafter32(float x, float y) { + float res; + foreach_active (i) { + uniform float r = nextafterf(extract(x, i), extract(y, i)); + res = insert(res, i, r); + } + return res; +} + +SCALAR_FUN_ATTR float futrts_lerp32(float v0, float v1, float t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR float futrts_ldexp32(float x, int32_t y) { + return x * pow((double)2.0, (double)y); +} + +SCALAR_FUN_ATTR float futrts_copysign32(float x, float y) { + int32_t xb = futrts_to_bits32(x); + int32_t yb = futrts_to_bits32(y); + return futrts_from_bits32((xb & ~(1<<31)) | (yb & (1<<31))); +} + +SCALAR_FUN_ATTR float futrts_mad32(float a, float b, float c) { + return a * b + c; +} + +SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) { + return a * b + c; +} + +#else // Not OpenCL or ISPC, but CUDA or plain C. 
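+
+// Note: this branch defers directly to libm. The ISPC hypot emulation
+// above instead scales by frexp/ldexp so that an*an + bn*bn cannot
+// overflow: e.g. hypot32(3e20f, 4e20f) is 5e20f, where a naive
+// sqrtf(x*x + y*y) would already have overflowed to infinity.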
+ +SCALAR_FUN_ATTR float futrts_log32(float x) { + return logf(x); +} + +SCALAR_FUN_ATTR float futrts_log2_32(float x) { + return log2f(x); +} + +SCALAR_FUN_ATTR float futrts_log10_32(float x) { + return log10f(x); +} + +SCALAR_FUN_ATTR float futrts_log1p_32(float x) { + return log1pf(x); +} + +SCALAR_FUN_ATTR float futrts_sqrt32(float x) { + return sqrtf(x); +} + +SCALAR_FUN_ATTR float futrts_cbrt32(float x) { + return cbrtf(x); +} + +SCALAR_FUN_ATTR float futrts_exp32(float x) { + return expf(x); +} + +SCALAR_FUN_ATTR float futrts_cos32(float x) { + return cosf(x); +} + +SCALAR_FUN_ATTR float futrts_sin32(float x) { + return sinf(x); +} + +SCALAR_FUN_ATTR float futrts_tan32(float x) { + return tanf(x); +} + +SCALAR_FUN_ATTR float futrts_acos32(float x) { + return acosf(x); +} + +SCALAR_FUN_ATTR float futrts_asin32(float x) { + return asinf(x); +} + +SCALAR_FUN_ATTR float futrts_atan32(float x) { + return atanf(x); +} + +SCALAR_FUN_ATTR float futrts_cosh32(float x) { + return coshf(x); +} + +SCALAR_FUN_ATTR float futrts_sinh32(float x) { + return sinhf(x); +} + +SCALAR_FUN_ATTR float futrts_tanh32(float x) { + return tanhf(x); +} + +SCALAR_FUN_ATTR float futrts_acosh32(float x) { + return acoshf(x); +} + +SCALAR_FUN_ATTR float futrts_asinh32(float x) { + return asinhf(x); +} + +SCALAR_FUN_ATTR float futrts_atanh32(float x) { + return atanhf(x); +} + +SCALAR_FUN_ATTR float futrts_atan2_32(float x, float y) { + return atan2f(x, y); +} + +SCALAR_FUN_ATTR float futrts_hypot32(float x, float y) { + return hypotf(x, y); +} + +SCALAR_FUN_ATTR float futrts_gamma32(float x) { + return tgammaf(x); +} + +SCALAR_FUN_ATTR float futrts_lgamma32(float x) { + return lgammaf(x); +} + +SCALAR_FUN_ATTR float futrts_erf32(float x) { + return erff(x); +} + +SCALAR_FUN_ATTR float futrts_erfc32(float x) { + return erfcf(x); +} + +SCALAR_FUN_ATTR float fmod32(float x, float y) { + return fmodf(x, y); +} + +SCALAR_FUN_ATTR float futrts_round32(float x) { + return rintf(x); +} + +SCALAR_FUN_ATTR float futrts_floor32(float x) { + return floorf(x); +} + +SCALAR_FUN_ATTR float futrts_ceil32(float x) { + return ceilf(x); +} + +SCALAR_FUN_ATTR float futrts_nextafter32(float x, float y) { + return nextafterf(x, y); +} + +SCALAR_FUN_ATTR float futrts_lerp32(float v0, float v1, float t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR float futrts_ldexp32(float x, int32_t y) { + return ldexpf(x, y); +} + +SCALAR_FUN_ATTR float futrts_copysign32(float x, float y) { + return copysignf(x, y); +} + +SCALAR_FUN_ATTR float futrts_mad32(float a, float b, float c) { + return a * b + c; +} + +SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) { + return fmaf(a, b, c); +} +#endif + +#if ISPC +SCALAR_FUN_ATTR int32_t futrts_to_bits32(float x) { + return intbits(x); +} + +SCALAR_FUN_ATTR float futrts_from_bits32(int32_t x) { + return floatbits(x); +} +#else +SCALAR_FUN_ATTR int32_t futrts_to_bits32(float x) { + union { + float f; + int32_t t; + } p; + + p.f = x; + return p.t; +} + +SCALAR_FUN_ATTR float futrts_from_bits32(int32_t x) { + union { + int32_t f; + float t; + } p; + + p.f = x; + return p.t; +} +#endif + +SCALAR_FUN_ATTR float fsignum32(float x) { + return futrts_isnan32(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 
1 : 0);
+}
+
+#ifdef FUTHARK_F64_ENABLED
+
+SCALAR_FUN_ATTR double futrts_from_bits64(int64_t x);
+SCALAR_FUN_ATTR int64_t futrts_to_bits64(double x);
+
+#if ISPC
+SCALAR_FUN_ATTR bool futrts_isinf64(double x) {
+  return !isnan(x) && isnan(x - x);
+}
+
+SCALAR_FUN_ATTR bool futrts_isfinite64(double x) {
+  return !isnan(x) && !futrts_isinf64(x);
+}
+
+SCALAR_FUN_ATTR double fdiv64(double x, double y) {
+  return x / y;
+}
+
+SCALAR_FUN_ATTR double fadd64(double x, double y) {
+  return x + y;
+}
+
+SCALAR_FUN_ATTR double fsub64(double x, double y) {
+  return x - y;
+}
+
+SCALAR_FUN_ATTR double fmul64(double x, double y) {
+  return x * y;
+}
+
+SCALAR_FUN_ATTR bool cmplt64(double x, double y) {
+  return x < y;
+}
+
+SCALAR_FUN_ATTR bool cmple64(double x, double y) {
+  return x <= y;
+}
+
+SCALAR_FUN_ATTR double sitofp_i8_f64(int8_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double sitofp_i16_f64(int16_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double sitofp_i32_f64(int32_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double sitofp_i64_f64(int64_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double uitofp_i8_f64(uint8_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double uitofp_i16_f64(uint16_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double uitofp_i32_f64(uint32_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double uitofp_i64_f64(uint64_t x) {
+  return (double) x;
+}
+
+SCALAR_FUN_ATTR double fabs64(double x) {
+  return abs(x);
+}
+
+SCALAR_FUN_ATTR double fmax64(double x, double y) {
+  return isnan(x) ? y : isnan(y) ? x : max(x, y);
+}
+
+SCALAR_FUN_ATTR double fmin64(double x, double y) {
+  return isnan(x) ? y : isnan(y) ? x : min(x, y);
+}
+
+SCALAR_FUN_ATTR double fpow64(double a, double b) {
+  float ret;
+  foreach_active (i) {
+    uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
+    ret = insert(ret, i, r);
+  }
+  return ret;
+}
+
+SCALAR_FUN_ATTR double futrts_log64(double x) {
+  return futrts_isfinite64(x) || (futrts_isinf64(x) && x < 0)? 
log(x) : x;
+}
+
+SCALAR_FUN_ATTR double futrts_log2_64(double x) {
+  return futrts_log64(x)/log(2.0d);
+}
+
+SCALAR_FUN_ATTR double futrts_log10_64(double x) {
+  return futrts_log64(x)/log(10.0d);
+}
+
+SCALAR_FUN_ATTR double futrts_log1p_64(double x) {
+  if(x == -1.0d || (futrts_isinf64(x) && x > 0.0d)) return x / 0.0d;
+  double y = 1.0d + x;
+  double z = y - 1.0d;
+  return log(y) - (z-x)/y;
+}
+
+SCALAR_FUN_ATTR double futrts_sqrt64(double x) {
+  return sqrt(x);
+}
+
+extern "C" unmasked uniform double cbrt(uniform double);
+SCALAR_FUN_ATTR double futrts_cbrt64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = cbrt(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double futrts_exp64(double x) {
+  return exp(x);
+}
+
+SCALAR_FUN_ATTR double futrts_cos64(double x) {
+  return cos(x);
+}
+
+SCALAR_FUN_ATTR double futrts_sin64(double x) {
+  return sin(x);
+}
+
+SCALAR_FUN_ATTR double futrts_tan64(double x) {
+  return tan(x);
+}
+
+SCALAR_FUN_ATTR double futrts_acos64(double x) {
+  return acos(x);
+}
+
+SCALAR_FUN_ATTR double futrts_asin64(double x) {
+  return asin(x);
+}
+
+SCALAR_FUN_ATTR double futrts_atan64(double x) {
+  return atan(x);
+}
+
+SCALAR_FUN_ATTR double futrts_cosh64(double x) {
+  return (exp(x)+exp(-x)) / 2.0d;
+}
+
+SCALAR_FUN_ATTR double futrts_sinh64(double x) {
+  return (exp(x)-exp(-x)) / 2.0d;
+}
+
+SCALAR_FUN_ATTR double futrts_tanh64(double x) {
+  return futrts_sinh64(x)/futrts_cosh64(x);
+}
+
+SCALAR_FUN_ATTR double futrts_acosh64(double x) {
+  double f = x+sqrt(x*x-1.0d);
+  if(futrts_isfinite64(f)) return log(f);
+  return f;
+}
+
+SCALAR_FUN_ATTR double futrts_asinh64(double x) {
+  double f = x+sqrt(x*x+1.0d);
+  if(futrts_isfinite64(f)) return log(f);
+  return f;
+}
+
+SCALAR_FUN_ATTR double futrts_atanh64(double x) {
+  double f = (1.0d+x)/(1.0d-x);
+  if(futrts_isfinite64(f)) return log(f)/2.0d;
+  return f;
+
+}
+
+SCALAR_FUN_ATTR double futrts_atan2_64(double x, double y) {
+  return atan2(x, y);
+}
+
+extern "C" unmasked uniform double hypot(uniform double x, uniform double y);
+SCALAR_FUN_ATTR double futrts_hypot64(double x, double y) {
+  double res;
+  foreach_active (i) {
+    uniform double r = hypot(extract(x, i), extract(y, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+extern "C" unmasked uniform double tgamma(uniform double x);
+SCALAR_FUN_ATTR double futrts_gamma64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = tgamma(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+extern "C" unmasked uniform double lgamma(uniform double x);
+SCALAR_FUN_ATTR double futrts_lgamma64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = lgamma(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+extern "C" unmasked uniform double erf(uniform double x);
+SCALAR_FUN_ATTR double futrts_erf64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = erf(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+extern "C" unmasked uniform double erfc(uniform double x);
+SCALAR_FUN_ATTR double futrts_erfc64(double x) {
+  double res;
+  foreach_active (i) {
+    uniform double r = erfc(extract(x, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double futrts_fma64(double a, double b, double c) {
+  return a * b + c;
+}
+
+SCALAR_FUN_ATTR double futrts_round64(double x) {
+  return round(x);
+}
+
+SCALAR_FUN_ATTR double futrts_ceil64(double x) {
+  return ceil(x);
+}
+
+extern "C" unmasked uniform double 
nextafter(uniform double x, uniform double y);
+SCALAR_FUN_ATTR double futrts_nextafter64(double x, double y) {
+  double res;
+  foreach_active (i) {
+    uniform double r = nextafter(extract(x, i), extract(y, i));
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double futrts_floor64(double x) {
+  return floor(x);
+}
+
+SCALAR_FUN_ATTR bool futrts_isnan64(double x) {
+  return isnan(x);
+}
+
+SCALAR_FUN_ATTR int8_t fptosi_f64_i8(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int8_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int16_t fptosi_f64_i16(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int16_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int32_t fptosi_f64_i32(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int32_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int64_t fptosi_f64_i64(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int64_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint8_t fptoui_f64_i8(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint8_t) (int8_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint16_t fptoui_f64_i16(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint16_t) (int16_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint32_t fptoui_f64_i32(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint32_t) (int32_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint64_t fptoui_f64_i64(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint64_t) (int64_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR bool ftob_f64_bool(double x) {
+  return x != 0.0;
+}
+
+SCALAR_FUN_ATTR double btof_bool_f64(bool x) {
+  return x ? 1.0 : 0.0;
+}
+
+SCALAR_FUN_ATTR int64_t futrts_to_bits64(double x) {
+  int64_t res;
+  foreach_active (i) {
+    uniform double tmp = extract(x, i);
+    uniform int64_t r = *((uniform int64_t* uniform)&tmp);
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double futrts_from_bits64(int64_t x) {
+  double res;
+  foreach_active (i) {
+    uniform int64_t tmp = extract(x, i);
+    uniform double r = *((uniform double* uniform)&tmp);
+    res = insert(res, i, r);
+  }
+  return res;
+}
+
+SCALAR_FUN_ATTR double fmod64(double x, double y) {
+  return x - y * trunc(x/y);
+}
+
+SCALAR_FUN_ATTR double fsignum64(double x) {
+  return futrts_isnan64(x) ? x : (x > 0 ? 1.0d : 0.0d) - (x < 0 ? 
1.0d : 0.0d); +} + +SCALAR_FUN_ATTR double futrts_lerp64(double v0, double v1, double t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR double futrts_ldexp64(double x, int32_t y) { + return x * pow((double)2.0, (double)y); +} + +SCALAR_FUN_ATTR double futrts_copysign64(double x, double y) { + int64_t xb = futrts_to_bits64(x); + int64_t yb = futrts_to_bits64(y); + return futrts_from_bits64((xb & ~(((int64_t)1)<<63)) | (yb & (((int64_t)1)<<63))); +} + +SCALAR_FUN_ATTR double futrts_mad64(double a, double b, double c) { + return a * b + c; +} + +SCALAR_FUN_ATTR float fpconv_f32_f32(float x) { + return (float) x; +} + +SCALAR_FUN_ATTR double fpconv_f32_f64(float x) { + return (double) x; +} + +SCALAR_FUN_ATTR float fpconv_f64_f32(double x) { + return (float) x; +} + +SCALAR_FUN_ATTR double fpconv_f64_f64(double x) { + return (double) x; +} + +#else + +SCALAR_FUN_ATTR double fdiv64(double x, double y) { + return x / y; +} + +SCALAR_FUN_ATTR double fadd64(double x, double y) { + return x + y; +} + +SCALAR_FUN_ATTR double fsub64(double x, double y) { + return x - y; +} + +SCALAR_FUN_ATTR double fmul64(double x, double y) { + return x * y; +} + +SCALAR_FUN_ATTR bool cmplt64(double x, double y) { + return x < y; +} + +SCALAR_FUN_ATTR bool cmple64(double x, double y) { + return x <= y; +} + +SCALAR_FUN_ATTR double sitofp_i8_f64(int8_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double sitofp_i16_f64(int16_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double sitofp_i32_f64(int32_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double sitofp_i64_f64(int64_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double uitofp_i8_f64(uint8_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double uitofp_i16_f64(uint16_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double uitofp_i32_f64(uint32_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double uitofp_i64_f64(uint64_t x) { + return (double) x; +} + +SCALAR_FUN_ATTR double fabs64(double x) { + return fabs(x); +} + +SCALAR_FUN_ATTR double fmax64(double x, double y) { + return fmax(x, y); +} + +SCALAR_FUN_ATTR double fmin64(double x, double y) { + return fmin(x, y); +} + +SCALAR_FUN_ATTR double fpow64(double x, double y) { + return pow(x, y); +} + +SCALAR_FUN_ATTR double futrts_log64(double x) { + return log(x); +} + +SCALAR_FUN_ATTR double futrts_log2_64(double x) { + return log2(x); +} + +SCALAR_FUN_ATTR double futrts_log10_64(double x) { + return log10(x); +} + +SCALAR_FUN_ATTR double futrts_log1p_64(double x) { + return log1p(x); +} + +SCALAR_FUN_ATTR double futrts_sqrt64(double x) { + return sqrt(x); +} + +SCALAR_FUN_ATTR double futrts_cbrt64(double x) { + return cbrt(x); +} + +SCALAR_FUN_ATTR double futrts_exp64(double x) { + return exp(x); +} + +SCALAR_FUN_ATTR double futrts_cos64(double x) { + return cos(x); +} + +SCALAR_FUN_ATTR double futrts_sin64(double x) { + return sin(x); +} + +SCALAR_FUN_ATTR double futrts_tan64(double x) { + return tan(x); +} + +SCALAR_FUN_ATTR double futrts_acos64(double x) { + return acos(x); +} + +SCALAR_FUN_ATTR double futrts_asin64(double x) { + return asin(x); +} + +SCALAR_FUN_ATTR double futrts_atan64(double x) { + return atan(x); +} + +SCALAR_FUN_ATTR double futrts_cosh64(double x) { + return cosh(x); +} + +SCALAR_FUN_ATTR double futrts_sinh64(double x) { + return sinh(x); +} + +SCALAR_FUN_ATTR double futrts_tanh64(double x) { + return tanh(x); +} + +SCALAR_FUN_ATTR double futrts_acosh64(double x) { + return acosh(x); +} + +SCALAR_FUN_ATTR double futrts_asinh64(double x) { + return asinh(x); +} + 
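+// Note: atanh below, like acosh/asinh above, maps directly to libm;
+// the ISPC branch instead derives it from the log identity
+// atanh(x) == log((1 + x) / (1 - x)) / 2, which agrees for finite
+// inputs.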
+SCALAR_FUN_ATTR double futrts_atanh64(double x) {
+  return atanh(x);
+}
+
+SCALAR_FUN_ATTR double futrts_atan2_64(double x, double y) {
+  return atan2(x, y);
+}
+
+SCALAR_FUN_ATTR double futrts_hypot64(double x, double y) {
+  return hypot(x, y);
+}
+
+SCALAR_FUN_ATTR double futrts_gamma64(double x) {
+  return tgamma(x);
+}
+
+SCALAR_FUN_ATTR double futrts_lgamma64(double x) {
+  return lgamma(x);
+}
+
+SCALAR_FUN_ATTR double futrts_erf64(double x) {
+  return erf(x);
+}
+
+SCALAR_FUN_ATTR double futrts_erfc64(double x) {
+  return erfc(x);
+}
+
+SCALAR_FUN_ATTR double futrts_fma64(double a, double b, double c) {
+  return fma(a, b, c);
+}
+
+SCALAR_FUN_ATTR double futrts_round64(double x) {
+  return rint(x);
+}
+
+SCALAR_FUN_ATTR double futrts_ceil64(double x) {
+  return ceil(x);
+}
+
+SCALAR_FUN_ATTR double futrts_nextafter64(double x, double y) {
+  return nextafter(x, y);
+}
+
+SCALAR_FUN_ATTR double futrts_floor64(double x) {
+  return floor(x);
+}
+
+SCALAR_FUN_ATTR bool futrts_isnan64(double x) {
+  return isnan(x);
+}
+
+SCALAR_FUN_ATTR bool futrts_isinf64(double x) {
+  return isinf(x);
+}
+
+SCALAR_FUN_ATTR int8_t fptosi_f64_i8(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int8_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int16_t fptosi_f64_i16(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int16_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int32_t fptosi_f64_i32(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int32_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR int64_t fptosi_f64_i64(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (int64_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint8_t fptoui_f64_i8(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint8_t) (int8_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint16_t fptoui_f64_i16(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint16_t) (int16_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint32_t fptoui_f64_i32(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint32_t) (int32_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR uint64_t fptoui_f64_i64(double x) {
+  if (futrts_isnan64(x) || futrts_isinf64(x)) {
+    return 0;
+  } else {
+    return (uint64_t) (int64_t) x;
+  }
+}
+
+SCALAR_FUN_ATTR bool ftob_f64_bool(double x) {
+  return x != 0;
+}
+
+SCALAR_FUN_ATTR double btof_bool_f64(bool x) {
+  return x ? 1 : 0;
+}
+
+SCALAR_FUN_ATTR int64_t futrts_to_bits64(double x) {
+  union {
+    double f;
+    int64_t t;
+  } p;
+
+  p.f = x;
+  return p.t;
+}
+
+SCALAR_FUN_ATTR double futrts_from_bits64(int64_t x) {
+  union {
+    int64_t f;
+    double t;
+  } p;
+
+  p.f = x;
+  return p.t;
+}
+
+SCALAR_FUN_ATTR double fmod64(double x, double y) {
+  return fmod(x, y);
+}
+
+SCALAR_FUN_ATTR double fsignum64(double x) {
+  return futrts_isnan64(x) ? 
x : (x > 0) - (x < 0); +} + +SCALAR_FUN_ATTR double futrts_lerp64(double v0, double v1, double t) { +#ifdef __OPENCL_VERSION__ + return mix(v0, v1, t); +#else + return v0 + (v1 - v0) * t; +#endif +} + +SCALAR_FUN_ATTR double futrts_ldexp64(double x, int32_t y) { + return ldexp(x, y); +} + +SCALAR_FUN_ATTR float futrts_copysign64(double x, double y) { + return copysign(x, y); +} + +SCALAR_FUN_ATTR double futrts_mad64(double a, double b, double c) { +#ifdef __OPENCL_VERSION__ + return mad(a, b, c); +#else + return a * b + c; +#endif +} + +SCALAR_FUN_ATTR float fpconv_f32_f32(float x) { + return (float) x; +} + +SCALAR_FUN_ATTR double fpconv_f32_f64(float x) { + return (double) x; +} + +SCALAR_FUN_ATTR float fpconv_f64_f32(double x) { + return (float) x; +} + +SCALAR_FUN_ATTR double fpconv_f64_f64(double x) { + return (double) x; +} + +#endif + +#endif + +// End of scalar.h. +// Start of scalar_f16.h. + +// Half-precision is emulated if needed (e.g. in straight C) with the +// native type used if possible. The emulation works by typedef'ing +// 'float' to 'f16', and then implementing all operations on single +// precision. To cut down on duplication, we use the same code for +// those Futhark functions that require just operators or casts. The +// in-memory representation for arrays will still be 16 bits even +// under emulation, so the compiler will have to be careful when +// generating reads or writes. + +#if !defined(cl_khr_fp16) && !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) && !(defined(ISPC)) +#define EMULATE_F16 +#endif + +#if !defined(EMULATE_F16) && defined(__OPENCL_VERSION__) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + +#ifdef EMULATE_F16 + +// Note that the half-precision storage format is still 16 bits - the +// compiler will have to be real careful! +typedef float f16; + +#elif ISPC +typedef float16 f16; + +#else + +#ifdef __CUDA_ARCH__ +#include +#endif + +typedef half f16; + +#endif + +// Some of these functions convert to single precision because half +// precision versions are not available. 
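+
+// Illustrative sketch (editorial, not compiler output): the emulation scheme
+// described above keeps array elements 16 bits wide even when scalar
+// arithmetic runs in single precision. Assuming the float2halfbits and
+// halfbits2float helpers that the emulated paths below rely on, a
+// load-compute-store cycle looks conceptually like this:
+//
+//   f16 v = (f16)halfbits2float(mem[i]);  // widen the stored 16-bit pattern
+//   v = fadd16(v, v);                     // compute (possibly as a float)
+//   mem[i] = float2halfbits((float)v);    // narrow the result back to 16 bits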
+ +SCALAR_FUN_ATTR f16 fadd16(f16 x, f16 y) { + return x + y; +} + +SCALAR_FUN_ATTR f16 fsub16(f16 x, f16 y) { + return x - y; +} + +SCALAR_FUN_ATTR f16 fmul16(f16 x, f16 y) { + return x * y; +} + +SCALAR_FUN_ATTR bool cmplt16(f16 x, f16 y) { + return x < y; +} + +SCALAR_FUN_ATTR bool cmple16(f16 x, f16 y) { + return x <= y; +} + +SCALAR_FUN_ATTR f16 sitofp_i8_f16(int8_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 sitofp_i16_f16(int16_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 sitofp_i32_f16(int32_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 sitofp_i64_f16(int64_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 uitofp_i8_f16(uint8_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 uitofp_i16_f16(uint16_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 uitofp_i32_f16(uint32_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR f16 uitofp_i64_f16(uint64_t x) { + return (f16) x; +} + +SCALAR_FUN_ATTR int8_t fptosi_f16_i8(f16 x) { + return (int8_t) (float) x; +} + +SCALAR_FUN_ATTR int16_t fptosi_f16_i16(f16 x) { + return (int16_t) x; +} + +SCALAR_FUN_ATTR int32_t fptosi_f16_i32(f16 x) { + return (int32_t) x; +} + +SCALAR_FUN_ATTR int64_t fptosi_f16_i64(f16 x) { + return (int64_t) x; +} + +SCALAR_FUN_ATTR uint8_t fptoui_f16_i8(f16 x) { + return (uint8_t) (float) x; +} + +SCALAR_FUN_ATTR uint16_t fptoui_f16_i16(f16 x) { + return (uint16_t) x; +} + +SCALAR_FUN_ATTR uint32_t fptoui_f16_i32(f16 x) { + return (uint32_t) x; +} + +SCALAR_FUN_ATTR uint64_t fptoui_f16_i64(f16 x) { + return (uint64_t) x; +} + +SCALAR_FUN_ATTR bool ftob_f16_bool(f16 x) { + return x != (f16)0; +} + +SCALAR_FUN_ATTR f16 btof_bool_f16(bool x) { + return x ? 1 : 0; +} + +#ifndef EMULATE_F16 +SCALAR_FUN_ATTR bool futrts_isnan16(f16 x) { + return isnan((float)x); +} + +#ifdef __OPENCL_VERSION__ + +SCALAR_FUN_ATTR f16 fabs16(f16 x) { + return fabs(x); +} + +SCALAR_FUN_ATTR f16 fmax16(f16 x, f16 y) { + return fmax(x, y); +} + +SCALAR_FUN_ATTR f16 fmin16(f16 x, f16 y) { + return fmin(x, y); +} + +SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { + return pow(x, y); +} + +#elif ISPC +SCALAR_FUN_ATTR f16 fabs16(f16 x) { + return abs(x); +} + +SCALAR_FUN_ATTR f16 fmax16(f16 x, f16 y) { + return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : max(x, y); +} + +SCALAR_FUN_ATTR f16 fmin16(f16 x, f16 y) { + return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : min(x, y); +} + +SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { + return pow(x, y); +} + +#else // Assuming CUDA. 
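+
+// Illustrative note: the explicit isnan checks in the ISPC fmax16/fmin16
+// above mirror the IEEE 754 rule that max/min of a NaN and a number is the
+// number, which ISPC's plain max/min do not promise. CUDA's fmaxf/fminf
+// already behave this way, so the wrappers below can stay plain.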
+ +SCALAR_FUN_ATTR f16 fabs16(f16 x) { + return fabsf(x); +} + +SCALAR_FUN_ATTR f16 fmax16(f16 x, f16 y) { + return fmaxf(x, y); +} + +SCALAR_FUN_ATTR f16 fmin16(f16 x, f16 y) { + return fminf(x, y); +} + +SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { + return powf(x, y); +} +#endif + +#if ISPC +SCALAR_FUN_ATTR bool futrts_isinf16(float x) { + return !futrts_isnan16(x) && futrts_isnan16(x - x); +} +SCALAR_FUN_ATTR bool futrts_isfinite16(float x) { + return !futrts_isnan16(x) && !futrts_isinf16(x); +} + +#else + +SCALAR_FUN_ATTR bool futrts_isinf16(f16 x) { + return isinf((float)x); +} +#endif + +#ifdef __OPENCL_VERSION__ +SCALAR_FUN_ATTR f16 futrts_log16(f16 x) { + return log(x); +} + +SCALAR_FUN_ATTR f16 futrts_log2_16(f16 x) { + return log2(x); +} + +SCALAR_FUN_ATTR f16 futrts_log10_16(f16 x) { + return log10(x); +} + +SCALAR_FUN_ATTR f16 futrts_log1p_16(f16 x) { + return log1p(x); +} + +SCALAR_FUN_ATTR f16 futrts_sqrt16(f16 x) { + return sqrt(x); +} + +SCALAR_FUN_ATTR f16 futrts_cbrt16(f16 x) { + return cbrt(x); +} + +SCALAR_FUN_ATTR f16 futrts_exp16(f16 x) { + return exp(x); +} + +SCALAR_FUN_ATTR f16 futrts_cos16(f16 x) { + return cos(x); +} + +SCALAR_FUN_ATTR f16 futrts_sin16(f16 x) { + return sin(x); +} + +SCALAR_FUN_ATTR f16 futrts_tan16(f16 x) { + return tan(x); +} + +SCALAR_FUN_ATTR f16 futrts_acos16(f16 x) { + return acos(x); +} + +SCALAR_FUN_ATTR f16 futrts_asin16(f16 x) { + return asin(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan16(f16 x) { + return atan(x); +} + +SCALAR_FUN_ATTR f16 futrts_cosh16(f16 x) { + return cosh(x); +} + +SCALAR_FUN_ATTR f16 futrts_sinh16(f16 x) { + return sinh(x); +} + +SCALAR_FUN_ATTR f16 futrts_tanh16(f16 x) { + return tanh(x); +} + +SCALAR_FUN_ATTR f16 futrts_acosh16(f16 x) { + return acosh(x); +} + +SCALAR_FUN_ATTR f16 futrts_asinh16(f16 x) { + return asinh(x); +} + +SCALAR_FUN_ATTR f16 futrts_atanh16(f16 x) { + return atanh(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan2_16(f16 x, f16 y) { + return atan2(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_hypot16(f16 x, f16 y) { + return hypot(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_gamma16(f16 x) { + return tgamma(x); +} + +SCALAR_FUN_ATTR f16 futrts_lgamma16(f16 x) { + return lgamma(x); +} + +SCALAR_FUN_ATTR f16 futrts_erf16(f16 x) { + return erf(x); +} + +SCALAR_FUN_ATTR f16 futrts_erfc16(f16 x) { + return erfc(x); +} + +SCALAR_FUN_ATTR f16 fmod16(f16 x, f16 y) { + return fmod(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_round16(f16 x) { + return rint(x); +} + +SCALAR_FUN_ATTR f16 futrts_floor16(f16 x) { + return floor(x); +} + +SCALAR_FUN_ATTR f16 futrts_ceil16(f16 x) { + return ceil(x); +} + +SCALAR_FUN_ATTR f16 futrts_nextafter16(f16 x, f16 y) { + return nextafter(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_lerp16(f16 v0, f16 v1, f16 t) { + return mix(v0, v1, t); +} + +SCALAR_FUN_ATTR f16 futrts_ldexp16(f16 x, int32_t y) { + return ldexp(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_copysign16(f16 x, f16 y) { + return copysign(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_mad16(f16 a, f16 b, f16 c) { + return mad(a, b, c); +} + +SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) { + return fma(a, b, c); +} +#elif ISPC + +SCALAR_FUN_ATTR f16 futrts_log16(f16 x) { + return futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0) ? 
log(x) : x; +} + +SCALAR_FUN_ATTR f16 futrts_log2_16(f16 x) { + return futrts_log16(x) / log(2.0f16); +} + +SCALAR_FUN_ATTR f16 futrts_log10_16(f16 x) { + return futrts_log16(x) / log(10.0f16); +} + +SCALAR_FUN_ATTR f16 futrts_log1p_16(f16 x) { + if(x == -1.0f16 || (futrts_isinf16(x) && x > 0.0f16)) return x / 0.0f16; + f16 y = 1.0f16 + x; + f16 z = y - 1.0f16; + return log(y) - (z-x)/y; +} + +SCALAR_FUN_ATTR f16 futrts_sqrt16(f16 x) { + return (float16)sqrt((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_exp16(f16 x) { + return exp(x); +} + +SCALAR_FUN_ATTR f16 futrts_cos16(f16 x) { + return (float16)cos((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_sin16(f16 x) { + return (float16)sin((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_tan16(f16 x) { + return (float16)tan((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_acos16(f16 x) { + return (float16)acos((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_asin16(f16 x) { + return (float16)asin((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_atan16(f16 x) { + return (float16)atan((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_cosh16(f16 x) { + return (exp(x)+exp(-x)) / 2.0f16; +} + +SCALAR_FUN_ATTR f16 futrts_sinh16(f16 x) { + return (exp(x)-exp(-x)) / 2.0f16; +} + +SCALAR_FUN_ATTR f16 futrts_tanh16(f16 x) { + return futrts_sinh16(x)/futrts_cosh16(x); +} + +SCALAR_FUN_ATTR f16 futrts_acosh16(f16 x) { + float16 f = x+(float16)sqrt((float)(x*x-1)); + if(futrts_isfinite16(f)) return log(f); + return f; +} + +SCALAR_FUN_ATTR f16 futrts_asinh16(f16 x) { + float16 f = x+(float16)sqrt((float)(x*x+1)); + if(futrts_isfinite16(f)) return log(f); + return f; +} + +SCALAR_FUN_ATTR f16 futrts_atanh16(f16 x) { + float16 f = (1+x)/(1-x); + if(futrts_isfinite16(f)) return log(f)/2.0f16; + return f; +} + +SCALAR_FUN_ATTR f16 futrts_atan2_16(f16 x, f16 y) { + return (float16)atan2((float)x, (float)y); +} + +SCALAR_FUN_ATTR f16 futrts_hypot16(f16 x, f16 y) { + return (float16)futrts_hypot32((float)x, (float)y); +} + +extern "C" unmasked uniform float tgammaf(uniform float x); +SCALAR_FUN_ATTR f16 futrts_gamma16(f16 x) { + f16 res; + foreach_active (i) { + uniform f16 r = (f16)tgammaf(extract((float)x, i)); + res = insert(res, i, r); + } + return res; +} + +extern "C" unmasked uniform float lgammaf(uniform float x); +SCALAR_FUN_ATTR f16 futrts_lgamma16(f16 x) { + f16 res; + foreach_active (i) { + uniform f16 r = (f16)lgammaf(extract((float)x, i)); + res = insert(res, i, r); + } + return res; +} + +SCALAR_FUN_ATTR f16 futrts_cbrt16(f16 x) { + f16 res = (f16)futrts_cbrt32((float)x); + return res; +} + +SCALAR_FUN_ATTR f16 futrts_erf16(f16 x) { + f16 res = (f16)futrts_erf32((float)x); + return res; +} + +SCALAR_FUN_ATTR f16 futrts_erfc16(f16 x) { + f16 res = (f16)futrts_erfc32((float)x); + return res; +} + +SCALAR_FUN_ATTR f16 fmod16(f16 x, f16 y) { + return x - y * (float16)trunc((float) (x/y)); +} + +SCALAR_FUN_ATTR f16 futrts_round16(f16 x) { + return (float16)round((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_floor16(f16 x) { + return (float16)floor((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_ceil16(f16 x) { + return (float16)ceil((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_nextafter16(f16 x, f16 y) { + return (float16)futrts_nextafter32((float)x, (float) y); +} + +SCALAR_FUN_ATTR f16 futrts_lerp16(f16 v0, f16 v1, f16 t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR f16 futrts_ldexp16(f16 x, int32_t y) { + return futrts_ldexp32((float)x, y); +} + +SCALAR_FUN_ATTR f16 futrts_copysign16(f16 x, f16 y) { + return futrts_copysign32((float)x, y); +} + +SCALAR_FUN_ATTR f16 
futrts_mad16(f16 a, f16 b, f16 c) { + return a * b + c; +} + +SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) { + return a * b + c; +} + +#else // Assume CUDA. + +SCALAR_FUN_ATTR f16 futrts_log16(f16 x) { + return hlog(x); +} + +SCALAR_FUN_ATTR f16 futrts_log2_16(f16 x) { + return hlog2(x); +} + +SCALAR_FUN_ATTR f16 futrts_log10_16(f16 x) { + return hlog10(x); +} + +SCALAR_FUN_ATTR f16 futrts_log1p_16(f16 x) { + return (f16)log1pf((float)x); +} + +SCALAR_FUN_ATTR f16 futrts_sqrt16(f16 x) { + return hsqrt(x); +} + +SCALAR_FUN_ATTR f16 futrts_cbrt16(f16 x) { + return cbrtf(x); +} + +SCALAR_FUN_ATTR f16 futrts_exp16(f16 x) { + return hexp(x); +} + +SCALAR_FUN_ATTR f16 futrts_cos16(f16 x) { + return hcos(x); +} + +SCALAR_FUN_ATTR f16 futrts_sin16(f16 x) { + return hsin(x); +} + +SCALAR_FUN_ATTR f16 futrts_tan16(f16 x) { + return tanf(x); +} + +SCALAR_FUN_ATTR f16 futrts_acos16(f16 x) { + return acosf(x); +} + +SCALAR_FUN_ATTR f16 futrts_asin16(f16 x) { + return asinf(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan16(f16 x) { + return atanf(x); +} + +SCALAR_FUN_ATTR f16 futrts_cosh16(f16 x) { + return coshf(x); +} + +SCALAR_FUN_ATTR f16 futrts_sinh16(f16 x) { + return sinhf(x); +} + +SCALAR_FUN_ATTR f16 futrts_tanh16(f16 x) { + return tanhf(x); +} + +SCALAR_FUN_ATTR f16 futrts_acosh16(f16 x) { + return acoshf(x); +} + +SCALAR_FUN_ATTR f16 futrts_asinh16(f16 x) { + return asinhf(x); +} + +SCALAR_FUN_ATTR f16 futrts_atanh16(f16 x) { + return atanhf(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan2_16(f16 x, f16 y) { + return atan2f(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_hypot16(f16 x, f16 y) { + return hypotf(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_gamma16(f16 x) { + return tgammaf(x); +} + +SCALAR_FUN_ATTR f16 futrts_lgamma16(f16 x) { + return lgammaf(x); +} + +SCALAR_FUN_ATTR f16 futrts_erf16(f16 x) { + return erff(x); +} + +SCALAR_FUN_ATTR f16 futrts_erfc16(f16 x) { + return erfcf(x); +} + +SCALAR_FUN_ATTR f16 fmod16(f16 x, f16 y) { + return fmodf(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_round16(f16 x) { + return rintf(x); +} + +SCALAR_FUN_ATTR f16 futrts_floor16(f16 x) { + return hfloor(x); +} + +SCALAR_FUN_ATTR f16 futrts_ceil16(f16 x) { + return hceil(x); +} + +SCALAR_FUN_ATTR f16 futrts_nextafter16(f16 x, f16 y) { + return __ushort_as_half(halfbitsnextafter(__half_as_ushort(x), __half_as_ushort(y))); +} + +SCALAR_FUN_ATTR f16 futrts_lerp16(f16 v0, f16 v1, f16 t) { + return v0 + (v1 - v0) * t; +} + +SCALAR_FUN_ATTR f16 futrts_ldexp16(f16 x, int32_t y) { + return futrts_ldexp32((float)x, y); +} + +SCALAR_FUN_ATTR f16 futrts_copysign16(f16 x, f16 y) { + return futrts_copysign32((float)x, y); +} + +SCALAR_FUN_ATTR f16 futrts_mad16(f16 a, f16 b, f16 c) { + return a * b + c; +} + +SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) { + return fmaf(a, b, c); +} + +#endif + +// The CUDA __half type cannot be put in unions for some reason, so we +// use bespoke conversion functions instead. 
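+
+// Illustrative note: in standard C the portable alternative to a union pun is
+// memcpy-based type punning, which is effectively what the intrinsics below
+// do for __half:
+//
+//   int16_t bits;
+//   memcpy(&bits, &h, sizeof bits);  // f16 -> raw bits
+//   memcpy(&h, &bits, sizeof h);     // raw bits -> f16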
+#ifdef __CUDA_ARCH__ +SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) { + return __half_as_ushort(x); +} +SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) { + return __ushort_as_half(x); +} +#elif ISPC + +SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) { + varying int16_t y = *((varying int16_t * uniform)&x); + return y; +} + +SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) { + varying f16 y = *((varying f16 * uniform)&x); + return y; +} +#else +SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) { + union { + f16 f; + int16_t t; + } p; + + p.f = x; + return p.t; +} + +SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) { + union { + int16_t f; + f16 t; + } p; + + p.f = x; + return p.t; +} +#endif + +#else // No native f16 - emulate. + +SCALAR_FUN_ATTR f16 fabs16(f16 x) { + return fabs32(x); +} + +SCALAR_FUN_ATTR f16 fmax16(f16 x, f16 y) { + return fmax32(x, y); +} + +SCALAR_FUN_ATTR f16 fmin16(f16 x, f16 y) { + return fmin32(x, y); +} + +SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { + return fpow32(x, y); +} + +SCALAR_FUN_ATTR bool futrts_isnan16(f16 x) { + return futrts_isnan32(x); +} + +SCALAR_FUN_ATTR bool futrts_isinf16(f16 x) { + return futrts_isinf32(x); +} + +SCALAR_FUN_ATTR f16 futrts_log16(f16 x) { + return futrts_log32(x); +} + +SCALAR_FUN_ATTR f16 futrts_log2_16(f16 x) { + return futrts_log2_32(x); +} + +SCALAR_FUN_ATTR f16 futrts_log10_16(f16 x) { + return futrts_log10_32(x); +} + +SCALAR_FUN_ATTR f16 futrts_log1p_16(f16 x) { + return futrts_log1p_32(x); +} + +SCALAR_FUN_ATTR f16 futrts_sqrt16(f16 x) { + return futrts_sqrt32(x); +} + +SCALAR_FUN_ATTR f16 futrts_cbrt16(f16 x) { + return futrts_cbrt32(x); +} + +SCALAR_FUN_ATTR f16 futrts_exp16(f16 x) { + return futrts_exp32(x); +} + +SCALAR_FUN_ATTR f16 futrts_cos16(f16 x) { + return futrts_cos32(x); +} + +SCALAR_FUN_ATTR f16 futrts_sin16(f16 x) { + return futrts_sin32(x); +} + +SCALAR_FUN_ATTR f16 futrts_tan16(f16 x) { + return futrts_tan32(x); +} + +SCALAR_FUN_ATTR f16 futrts_acos16(f16 x) { + return futrts_acos32(x); +} + +SCALAR_FUN_ATTR f16 futrts_asin16(f16 x) { + return futrts_asin32(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan16(f16 x) { + return futrts_atan32(x); +} + +SCALAR_FUN_ATTR f16 futrts_cosh16(f16 x) { + return futrts_cosh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_sinh16(f16 x) { + return futrts_sinh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_tanh16(f16 x) { + return futrts_tanh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_acosh16(f16 x) { + return futrts_acosh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_asinh16(f16 x) { + return futrts_asinh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_atanh16(f16 x) { + return futrts_atanh32(x); +} + +SCALAR_FUN_ATTR f16 futrts_atan2_16(f16 x, f16 y) { + return futrts_atan2_32(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_hypot16(f16 x, f16 y) { + return futrts_hypot32(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_gamma16(f16 x) { + return futrts_gamma32(x); +} + +SCALAR_FUN_ATTR f16 futrts_lgamma16(f16 x) { + return futrts_lgamma32(x); +} + +SCALAR_FUN_ATTR f16 futrts_erf16(f16 x) { + return futrts_erf32(x); +} + +SCALAR_FUN_ATTR f16 futrts_erfc16(f16 x) { + return futrts_erfc32(x); +} + +SCALAR_FUN_ATTR f16 fmod16(f16 x, f16 y) { + return fmod32(x, y); +} + +SCALAR_FUN_ATTR f16 futrts_round16(f16 x) { + return futrts_round32(x); +} + +SCALAR_FUN_ATTR f16 futrts_floor16(f16 x) { + return futrts_floor32(x); +} + +SCALAR_FUN_ATTR f16 futrts_ceil16(f16 x) { + return futrts_ceil32(x); +} + +SCALAR_FUN_ATTR f16 futrts_nextafter16(f16 x, f16 y) { + return halfbits2float(halfbitsnextafter(float2halfbits(x), 
float2halfbits(y)));
+}
+
+SCALAR_FUN_ATTR f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
+  return futrts_lerp32(v0, v1, t);
+}
+
+SCALAR_FUN_ATTR f16 futrts_ldexp16(f16 x, int32_t y) {
+  return futrts_ldexp32(x, y);
+}
+
+SCALAR_FUN_ATTR f16 futrts_copysign16(f16 x, f16 y) {
+  return futrts_copysign32((float)x, y);
+}
+
+SCALAR_FUN_ATTR f16 futrts_mad16(f16 a, f16 b, f16 c) {
+  return futrts_mad32(a, b, c);
+}
+
+SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) {
+  return futrts_fma32(a, b, c);
+}
+
+// Even when we are using an OpenCL that does not support cl_khr_fp16,
+// it must still support vload_half for actually creating a
+// half-precision number, which can then be efficiently converted to a
+// float. Similarly for vstore_half.
+#ifdef __OPENCL_VERSION__
+
+SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) {
+  int16_t y;
+  // Violating strict aliasing here.
+  vstore_half((float)x, 0, (half*)&y);
+  return y;
+}
+
+SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) {
+  return (f16)vload_half(0, (half*)&x);
+}
+
+#else
+
+SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) {
+  return (int16_t)float2halfbits(x);
+}
+
+SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) {
+  return halfbits2float((uint16_t)x);
+}
+
+#endif
+
+SCALAR_FUN_ATTR f16 fsignum16(f16 x) {
+  return futrts_isnan16(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);
+}
+
+#endif
+
+SCALAR_FUN_ATTR f16 fpconv_f16_f16(f16 x) {
+  return x;
+}
+
+SCALAR_FUN_ATTR float fpconv_f16_f32(f16 x) {
+  return x;
+}
+
+SCALAR_FUN_ATTR f16 fpconv_f32_f16(float x) {
+  return (f16) x;
+}
+
+#ifdef FUTHARK_F64_ENABLED
+
+SCALAR_FUN_ATTR double fpconv_f16_f64(f16 x) {
+  return (double) x;
+}
+
+#if ISPC
+SCALAR_FUN_ATTR f16 fpconv_f64_f16(double x) {
+  return (f16) ((float)x);
+}
+#else
+SCALAR_FUN_ATTR f16 fpconv_f64_f16(double x) {
+  return (f16) x;
+}
+#endif
+#endif
+
+
+// End of scalar_f16.h.
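+
+// Illustrative sketch (editorial, not compiler output): chaining the
+// conversions above, a value that is exactly representable in half precision
+// survives a round trip through its storage bits (1.5 encodes as 0x3E00):
+//
+//   f16 h = fpconv_f32_f16(1.5f);
+//   int16_t bits = futrts_to_bits16(h);                      // 0x3E00
+//   float back = fpconv_f16_f32(futrts_from_bits16(bits));   // == 1.5f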
+// Start of atomics.h
+
+SCALAR_FUN_ATTR int32_t atomic_xchg_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_xchg_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_cmpxchg_i32_global(volatile __global int32_t *p,
+                                                  int32_t cmp, int32_t val);
+SCALAR_FUN_ATTR int32_t atomic_cmpxchg_i32_shared(volatile __local int32_t *p,
+                                                  int32_t cmp, int32_t val);
+SCALAR_FUN_ATTR int32_t atomic_add_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_add_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR float atomic_fadd_f32_global(volatile __global float *p, float x);
+SCALAR_FUN_ATTR float atomic_fadd_f32_shared(volatile __local float *p, float x);
+SCALAR_FUN_ATTR int32_t atomic_smax_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_smax_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_smin_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_smin_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR uint32_t atomic_umax_i32_global(volatile __global uint32_t *p, uint32_t x);
+SCALAR_FUN_ATTR uint32_t atomic_umax_i32_shared(volatile __local uint32_t *p, uint32_t x);
+SCALAR_FUN_ATTR uint32_t atomic_umin_i32_global(volatile __global uint32_t *p, uint32_t x);
+SCALAR_FUN_ATTR uint32_t atomic_umin_i32_shared(volatile __local uint32_t *p, uint32_t x);
+SCALAR_FUN_ATTR int32_t atomic_and_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_and_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_or_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_or_i32_shared(volatile __local int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_xor_i32_global(volatile __global int32_t *p, int32_t x);
+SCALAR_FUN_ATTR int32_t atomic_xor_i32_shared(volatile __local int32_t *p, int32_t x);
+
+SCALAR_FUN_ATTR int32_t atomic_xchg_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicExch((int32_t*)p, x);
+#else
+  return atomic_xchg(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_xchg_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicExch((int32_t*)p, x);
+#else
+  return atomic_xchg(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_cmpxchg_i32_global(volatile __global int32_t *p,
+                                                  int32_t cmp, int32_t val) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicCAS((int32_t*)p, cmp, val);
+#else
+  return atomic_cmpxchg(p, cmp, val);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_cmpxchg_i32_shared(volatile __local int32_t *p,
+                                                  int32_t cmp, int32_t val) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicCAS((int32_t*)p, cmp, val);
+#else
+  return atomic_cmpxchg(p, cmp, val);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_add_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((int32_t*)p, x);
+#else
+  return atomic_add(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_add_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((int32_t*)p, x);
+#else
+  return atomic_add(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR float atomic_fadd_f32_global(volatile __global float *p, float x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((float*)p, x);
+#else
+  // No native f32 atomic add: emulate with a compare-and-swap loop on the
+  // 32-bit pattern, retrying until no other thread updates *p in between.
+  union { int32_t i; float f; } old;
+  union { int32_t i; float f; } assumed;
+  old.f = *p;
+  do {
+    assumed.f = old.f;
+    old.f = old.f + x;
+    old.i = atomic_cmpxchg_i32_global((volatile __global int32_t*)p, assumed.i, old.i);
+  } while (assumed.i != old.i);
+  return old.f;
+#endif
+}
+
+SCALAR_FUN_ATTR float atomic_fadd_f32_shared(volatile __local float *p, float x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((float*)p, x);
+#else
+  // Same CAS-loop emulation as the global version, on local memory.
+  union { int32_t i; float f; } old;
+  union { int32_t i; float f; } assumed;
+  old.f = *p;
+  do {
+    assumed.f = old.f;
+    old.f = old.f + x;
+    old.i = atomic_cmpxchg_i32_shared((volatile __local int32_t*)p, assumed.i, old.i);
+  } while (assumed.i != old.i);
+  return old.f;
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_smax_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMax((int32_t*)p, x);
+#else
+  return atomic_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_smax_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMax((int32_t*)p, x);
+#else
+  return atomic_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_smin_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMin((int32_t*)p, x);
+#else
+  return atomic_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_smin_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMin((int32_t*)p, x);
+#else
+  return atomic_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR uint32_t atomic_umax_i32_global(volatile __global uint32_t *p, uint32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMax((uint32_t*)p, x);
+#else
+  return atomic_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR uint32_t atomic_umax_i32_shared(volatile __local uint32_t *p, uint32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMax((uint32_t*)p, x);
+#else
+  return atomic_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR uint32_t atomic_umin_i32_global(volatile __global uint32_t *p, uint32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMin((uint32_t*)p, x);
+#else
+  return atomic_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR uint32_t atomic_umin_i32_shared(volatile __local uint32_t *p, uint32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicMin((uint32_t*)p, x);
+#else
+  return atomic_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_and_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAnd((int32_t*)p, x);
+#else
+  return atomic_and(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_and_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAnd((int32_t*)p, x);
+#else
+  return atomic_and(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_or_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicOr((int32_t*)p, x);
+#else
+  return atomic_or(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_or_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicOr((int32_t*)p, x);
+#else
+  return atomic_or(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_xor_i32_global(volatile __global int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicXor((int32_t*)p, x);
+#else
+  return atomic_xor(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int32_t atomic_xor_i32_shared(volatile __local int32_t *p, int32_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicXor((int32_t*)p, x);
+#else
+  return atomic_xor(p, x);
+#endif
+}
+
+// Start of 64 bit atomics
+
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) || defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+
+SCALAR_FUN_ATTR int64_t atomic_xchg_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_xchg_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_cmpxchg_i64_global(volatile __global int64_t *p,
+                                                  int64_t cmp, int64_t val);
+SCALAR_FUN_ATTR int64_t atomic_cmpxchg_i64_shared(volatile __local int64_t *p,
+                                                  int64_t cmp, int64_t val);
+SCALAR_FUN_ATTR int64_t atomic_add_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_add_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_smax_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_smax_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_smin_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_smin_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR uint64_t atomic_umax_i64_global(volatile __global uint64_t *p, uint64_t x);
+SCALAR_FUN_ATTR uint64_t atomic_umax_i64_shared(volatile __local uint64_t *p, uint64_t x);
+SCALAR_FUN_ATTR uint64_t atomic_umin_i64_global(volatile __global uint64_t *p, uint64_t x);
+SCALAR_FUN_ATTR uint64_t atomic_umin_i64_shared(volatile __local uint64_t *p, uint64_t x);
+SCALAR_FUN_ATTR int64_t atomic_and_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_and_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_or_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_or_i64_shared(volatile __local int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_xor_i64_global(volatile __global int64_t *p, int64_t x);
+SCALAR_FUN_ATTR int64_t atomic_xor_i64_shared(volatile __local int64_t *p, int64_t x);
+
+#ifdef FUTHARK_F64_ENABLED
+SCALAR_FUN_ATTR double atomic_fadd_f64_global(volatile __global double *p, double x);
+SCALAR_FUN_ATTR double atomic_fadd_f64_shared(volatile __local double *p, double x);
+#endif
+
+SCALAR_FUN_ATTR int64_t atomic_xchg_i64_global(volatile __global int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicExch((uint64_t*)p, x);
+#else
+  return atom_xchg(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_xchg_i64_shared(volatile __local int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicExch((uint64_t*)p, x);
+#else
+  return atom_xchg(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_cmpxchg_i64_global(volatile __global int64_t *p,
+                                                  int64_t cmp, int64_t val) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicCAS((uint64_t*)p, cmp, val);
+#else
+  return atom_cmpxchg(p, cmp, val);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_cmpxchg_i64_shared(volatile __local int64_t *p,
+                                                  int64_t cmp, int64_t val) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicCAS((uint64_t*)p, cmp, val);
+#else
+  return atom_cmpxchg(p, cmp, val);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_add_i64_global(volatile __global int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((uint64_t*)p, x);
+#else
+  return atom_add(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_add_i64_shared(volatile __local int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP)
+  return atomicAdd((uint64_t*)p, x);
+#else
+  return atom_add(p, x);
+#endif
+}
+
+#ifdef FUTHARK_F64_ENABLED
+
+SCALAR_FUN_ATTR double atomic_fadd_f64_global(volatile __global double *p, double x) {
+#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600 || defined(FUTHARK_HIP)
+  return atomicAdd((double*)p, x);
+#else
+  // No native f64 atomic add: same CAS-loop emulation as the f32 case, on
+  // the 64-bit pattern.
+  union { int64_t i; double f; } old;
+  union { int64_t i; double f; } assumed;
+  old.f = *p;
+  do {
+    assumed.f = old.f;
+    old.f = old.f + x;
+    old.i = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed.i, old.i);
+  } while (assumed.i != old.i);
+  return old.f;
+#endif
+}
+
+SCALAR_FUN_ATTR double atomic_fadd_f64_shared(volatile __local double *p, double x) {
+#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600 || defined(FUTHARK_HIP)
+  return atomicAdd((double*)p, x);
+#else
+  // Same CAS-loop emulation as the global version, on local memory.
+  union { int64_t i; double f; } old;
+  union { int64_t i; double f; } assumed;
+  old.f = *p;
+  do {
+    assumed.f = old.f;
+    old.f = old.f + x;
+    old.i = atomic_cmpxchg_i64_shared((volatile __local int64_t*)p, assumed.i, old.i);
+  } while (assumed.i != old.i);
+  return old.f;
+#endif
+}
+
+#endif
+
+SCALAR_FUN_ATTR int64_t atomic_smax_i64_global(volatile __global int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA)
+  return atomicMax((int64_t*)p, x);
+#elif defined(FUTHARK_HIP)
+  // Currently missing in HIP; probably a temporary oversight.
+  int64_t old = *p, assumed;
+  do {
+    assumed = old;
+    old = smax64(old, x);
+    old = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed, old);
+  } while (assumed != old);
+  return old;
+#else
+  return atom_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_smax_i64_shared(volatile __local int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA)
+  return atomicMax((int64_t*)p, x);
+#elif defined(FUTHARK_HIP)
+  // Currently missing in HIP; probably a temporary oversight.
+  int64_t old = *p, assumed;
+  do {
+    assumed = old;
+    old = smax64(old, x);
+    old = atomic_cmpxchg_i64_shared((volatile __local int64_t*)p, assumed, old);
+  } while (assumed != old);
+  return old;
+#else
+  return atom_max(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_smin_i64_global(volatile __global int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA)
+  return atomicMin((int64_t*)p, x);
+#elif defined(FUTHARK_HIP)
+  // Currently missing in HIP; probably a temporary oversight.
+  int64_t old = *p, assumed;
+  do {
+    assumed = old;
+    old = smin64(old, x);
+    old = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed, old);
+  } while (assumed != old);
+  return old;
+#else
+  return atom_min(p, x);
+#endif
+}
+
+SCALAR_FUN_ATTR int64_t atomic_smin_i64_shared(volatile __local int64_t *p, int64_t x) {
+#if defined(FUTHARK_CUDA)
+  return atomicMin((int64_t*)p, x);
+#elif defined(FUTHARK_HIP)
+  // Currently missing in HIP; probably a temporary oversight.
+ int64_t old = *p, assumed; + do { + assumed = old; + old = smin64(old, x); + old = atomic_cmpxchg_i64_shared((volatile __local int64_t*)p, assumed, old); + } while (assumed != old); + return old; +#else + return atom_min(p, x); +#endif +} + +SCALAR_FUN_ATTR uint64_t atomic_umax_i64_global(volatile __global uint64_t *p, uint64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicMax((uint64_t*)p, x); +#else + return atom_max(p, x); +#endif +} + +SCALAR_FUN_ATTR uint64_t atomic_umax_i64_shared(volatile __local uint64_t *p, uint64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicMax((uint64_t*)p, x); +#else + return atom_max(p, x); +#endif +} + +SCALAR_FUN_ATTR uint64_t atomic_umin_i64_global(volatile __global uint64_t *p, uint64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicMin((uint64_t*)p, x); +#else + return atom_min(p, x); +#endif +} + +SCALAR_FUN_ATTR uint64_t atomic_umin_i64_shared(volatile __local uint64_t *p, uint64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicMin((uint64_t*)p, x); +#else + return atom_min(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_and_i64_global(volatile __global int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicAnd((uint64_t*)p, x); +#else + return atom_and(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_and_i64_shared(volatile __local int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicAnd((uint64_t*)p, x); +#else + return atom_and(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_or_i64_global(volatile __global int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicOr((uint64_t*)p, x); +#else + return atom_or(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_or_i64_shared(volatile __local int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicOr((uint64_t*)p, x); +#else + return atom_or(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_xor_i64_global(volatile __global int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicXor((uint64_t*)p, x); +#else + return atom_xor(p, x); +#endif +} + +SCALAR_FUN_ATTR int64_t atomic_xor_i64_shared(volatile __local int64_t *p, int64_t x) { +#if defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) + return atomicXor((uint64_t*)p, x); +#else + return atom_xor(p, x); +#endif +} + +#endif // defined(FUTHARK_CUDA) || defined(FUTHARK_HIP) || defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) + +// End of atomics.h +// Start of transpose.cl + +#define GEN_TRANSPOSE_KERNELS(NAME, ELEM_TYPE) \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM*2, TR_TILE_DIM/TR_ELEMS_PER_THREAD, 1)\ +void map_transpose_##NAME(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + int32_t num_arrays, \ + int32_t x_elems, \ + int32_t y_elems, \ + int32_t mulx, \ + int32_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + (void)mulx; (void)muly; \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int32_t our_array_offset = 
tblock_id_2 * x_elems * y_elems; \ + int32_t odata_offset = dst_offset + our_array_offset; \ + int32_t idata_offset = src_offset + our_array_offset; \ + int32_t x_index = global_id_0; \ + int32_t y_index = tblock_id_1 * TR_TILE_DIM + get_local_id(1); \ + if (x_index < x_elems) { \ + for (int32_t j = 0; j < TR_ELEMS_PER_THREAD; j++) { \ + int32_t index_i = (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * x_elems + x_index; \ + if (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD) < y_elems) { \ + block[(get_local_id(1) + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * (TR_TILE_DIM+1) + \ + get_local_id(0)] = \ + src_mem[idata_offset + index_i]; \ + } \ + } \ + } \ + barrier_local(); \ + x_index = tblock_id_1 * TR_TILE_DIM + get_local_id(0); \ + y_index = tblock_id_0 * TR_TILE_DIM + get_local_id(1); \ + if (x_index < y_elems) { \ + for (int32_t j = 0; j < TR_ELEMS_PER_THREAD; j++) { \ + int32_t index_out = (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * y_elems + x_index; \ + if (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD) < x_elems) { \ + dst_mem[(odata_offset + index_out)] = \ + block[get_local_id(0) * (TR_TILE_DIM+1) + \ + get_local_id(1) + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)]; \ + } \ + } \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_global_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_global_size(1); \ + } \ +} \ + \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM, TR_BLOCK_DIM, 1) \ +void map_transpose_##NAME##_low_height(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + int32_t num_arrays, \ + int32_t x_elems, \ + int32_t y_elems, \ + int32_t mulx, \ + int32_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int32_t our_array_offset = tblock_id_2 * x_elems * y_elems; \ + int32_t odata_offset = dst_offset + our_array_offset; \ + int32_t idata_offset = src_offset + our_array_offset; \ + int32_t x_index = \ + tblock_id_0 * TR_BLOCK_DIM * mulx + \ + get_local_id(0) + \ + get_local_id(1)%mulx * TR_BLOCK_DIM; \ + int32_t y_index = tblock_id_1 * TR_BLOCK_DIM + get_local_id(1)/mulx; \ + int32_t index_in = y_index * x_elems + x_index; \ + if (x_index < x_elems && y_index < y_elems) { \ + block[get_local_id(1) * (TR_BLOCK_DIM+1) + get_local_id(0)] = \ + src_mem[idata_offset + index_in]; \ + } \ + barrier_local(); \ + x_index = tblock_id_1 * TR_BLOCK_DIM + get_local_id(0)/mulx; \ + y_index = \ + tblock_id_0 * TR_BLOCK_DIM * mulx + \ + get_local_id(1) + \ + (get_local_id(0)%mulx) * TR_BLOCK_DIM; \ + int32_t index_out = y_index * y_elems + x_index; \ + if (x_index < y_elems && y_index < x_elems) { \ + dst_mem[odata_offset + index_out] = \ + block[get_local_id(0) * (TR_BLOCK_DIM+1) + get_local_id(1)]; \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_global_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_global_size(1); \ + } \ +} \ + \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM, TR_BLOCK_DIM, 1) \ +void map_transpose_##NAME##_low_width(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, 
\ + int64_t src_offset, \ + int32_t num_arrays, \ + int32_t x_elems, \ + int32_t y_elems, \ + int32_t mulx, \ + int32_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int32_t our_array_offset = tblock_id_2 * x_elems * y_elems; \ + int32_t odata_offset = dst_offset + our_array_offset; \ + int32_t idata_offset = src_offset + our_array_offset; \ + int32_t x_index = tblock_id_0 * TR_BLOCK_DIM + get_local_id(0)/muly; \ + int32_t y_index = \ + tblock_id_1 * TR_BLOCK_DIM * muly + \ + get_local_id(1) + (get_local_id(0)%muly) * TR_BLOCK_DIM; \ + int32_t index_in = y_index * x_elems + x_index; \ + if (x_index < x_elems && y_index < y_elems) { \ + block[get_local_id(1) * (TR_BLOCK_DIM+1) + get_local_id(0)] = \ + src_mem[idata_offset + index_in]; \ + } \ + barrier_local(); \ + x_index = tblock_id_1 * TR_BLOCK_DIM * muly + \ + get_local_id(0) + (get_local_id(1)%muly) * TR_BLOCK_DIM; \ + y_index = tblock_id_0 * TR_BLOCK_DIM + get_local_id(1)/muly; \ + int32_t index_out = y_index * y_elems + x_index; \ + if (x_index < y_elems && y_index < x_elems) { \ + dst_mem[odata_offset + index_out] = \ + block[get_local_id(0) * (TR_BLOCK_DIM+1) + get_local_id(1)]; \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_num_tblocks(2) * get_local_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_num_tblocks(1) * get_local_size(1); \ + } \ +} \ + \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM*TR_BLOCK_DIM, 1, 1) \ +void map_transpose_##NAME##_small(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + int32_t num_arrays, \ + int32_t x_elems, \ + int32_t y_elems, \ + int32_t mulx, \ + int32_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + (void)mulx; (void)muly; \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int32_t our_array_offset = global_id_0/(y_elems * x_elems) * y_elems * x_elems; \ + int32_t x_index = (global_id_0 % (y_elems * x_elems))/y_elems; \ + int32_t y_index = global_id_0%y_elems; \ + int32_t odata_offset = dst_offset + our_array_offset; \ + int32_t idata_offset = src_offset + our_array_offset; \ + int32_t index_in = y_index * x_elems + x_index; \ + int32_t index_out = x_index * y_elems + y_index; \ + if (global_id_0 < x_elems * y_elems * num_arrays) { \ + dst_mem[odata_offset + index_out] = src_mem[idata_offset + index_in]; \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_global_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_global_size(1); \ + } \ +} \ + \ +FUTHARK_KERNEL_SIZED(TR_BLOCK_DIM*2, TR_TILE_DIM/TR_ELEMS_PER_THREAD, 1)\ +void map_transpose_##NAME##_large(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + 
int64_t num_arrays, \ + int64_t x_elems, \ + int64_t y_elems, \ + int64_t mulx, \ + int64_t muly, \ + int32_t repeat_1, \ + int32_t repeat_2) { \ + (void)mulx; (void)muly; \ + __local ELEM_TYPE* block = (__local ELEM_TYPE*)shared_mem; \ + int tblock_id_0 = get_tblock_id(0); \ + int global_id_0 = get_global_id(0); \ + int tblock_id_1 = get_tblock_id(1); \ + int global_id_1 = get_global_id(1); \ + for (int i1 = 0; i1 <= repeat_1; i1++) { \ + int tblock_id_2 = get_tblock_id(2); \ + int global_id_2 = get_global_id(2); \ + for (int i2 = 0; i2 <= repeat_2; i2++) { \ + int64_t our_array_offset = tblock_id_2 * x_elems * y_elems; \ + int64_t odata_offset = dst_offset + our_array_offset; \ + int64_t idata_offset = src_offset + our_array_offset; \ + int64_t x_index = global_id_0; \ + int64_t y_index = tblock_id_1 * TR_TILE_DIM + get_local_id(1); \ + if (x_index < x_elems) { \ + for (int64_t j = 0; j < TR_ELEMS_PER_THREAD; j++) { \ + int64_t index_i = (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * x_elems + x_index; \ + if (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD) < y_elems) { \ + block[(get_local_id(1) + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * (TR_TILE_DIM+1) + \ + get_local_id(0)] = \ + src_mem[idata_offset + index_i]; \ + } \ + } \ + } \ + barrier_local(); \ + x_index = tblock_id_1 * TR_TILE_DIM + get_local_id(0); \ + y_index = tblock_id_0 * TR_TILE_DIM + get_local_id(1); \ + if (x_index < y_elems) { \ + for (int64_t j = 0; j < TR_ELEMS_PER_THREAD; j++) { \ + int64_t index_out = (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)) * y_elems + x_index; \ + if (y_index + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD) < x_elems) { \ + dst_mem[(odata_offset + index_out)] = \ + block[get_local_id(0) * (TR_TILE_DIM+1) + \ + get_local_id(1) + j * (TR_TILE_DIM/TR_ELEMS_PER_THREAD)]; \ + } \ + } \ + } \ + tblock_id_2 += get_num_tblocks(2); \ + global_id_2 += get_global_size(2); \ + } \ + tblock_id_1 += get_num_tblocks(1); \ + global_id_1 += get_global_size(1); \ + } \ +} \ + +GEN_TRANSPOSE_KERNELS(1b, uint8_t) +GEN_TRANSPOSE_KERNELS(2b, uint16_t) +GEN_TRANSPOSE_KERNELS(4b, uint32_t) +GEN_TRANSPOSE_KERNELS(8b, uint64_t) + +// End of transpose.cl +// Start of copy.cl + +#define GEN_COPY_KERNEL(NAME, ELEM_TYPE) \ +FUTHARK_KERNEL void lmad_copy_##NAME(SHARED_MEM_PARAM \ + __global ELEM_TYPE *dst_mem, \ + int64_t dst_offset, \ + __global ELEM_TYPE *src_mem, \ + int64_t src_offset, \ + int64_t n, \ + int r, \ + int64_t shape0, int64_t dst_stride0, int64_t src_stride0, \ + int64_t shape1, int64_t dst_stride1, int64_t src_stride1, \ + int64_t shape2, int64_t dst_stride2, int64_t src_stride2, \ + int64_t shape3, int64_t dst_stride3, int64_t src_stride3, \ + int64_t shape4, int64_t dst_stride4, int64_t src_stride4, \ + int64_t shape5, int64_t dst_stride5, int64_t src_stride5, \ + int64_t shape6, int64_t dst_stride6, int64_t src_stride6, \ + int64_t shape7, int64_t dst_stride7, int64_t src_stride7) { \ + int64_t gtid = get_global_id(0); \ + int64_t remainder = gtid; \ + \ + if (gtid >= n) { \ + return; \ + } \ + \ + if (r > 0) { \ + int64_t i = remainder % shape0; \ + dst_offset += i * dst_stride0; \ + src_offset += i * src_stride0; \ + remainder /= shape0; \ + } \ + if (r > 1) { \ + int64_t i = remainder % shape1; \ + dst_offset += i * dst_stride1; \ + src_offset += i * src_stride1; \ + remainder /= shape1; \ + } \ + if (r > 2) { \ + int64_t i = remainder % shape2; \ + dst_offset += i * dst_stride2; \ + src_offset += i * src_stride2; \ + remainder /= shape2; \ + } \ + if (r > 3) { \ + int64_t i = remainder % shape3; 
\ + dst_offset += i * dst_stride3; \ + src_offset += i * src_stride3; \ + remainder /= shape3; \ + } \ + if (r > 4) { \ + int64_t i = remainder % shape4; \ + dst_offset += i * dst_stride4; \ + src_offset += i * src_stride4; \ + remainder /= shape4; \ + } \ + if (r > 5) { \ + int64_t i = remainder % shape5; \ + dst_offset += i * dst_stride5; \ + src_offset += i * src_stride5; \ + remainder /= shape5; \ + } \ + if (r > 6) { \ + int64_t i = remainder % shape6; \ + dst_offset += i * dst_stride6; \ + src_offset += i * src_stride6; \ + remainder /= shape6; \ + } \ + if (r > 7) { \ + int64_t i = remainder % shape7; \ + dst_offset += i * dst_stride7; \ + src_offset += i * src_stride7; \ + remainder /= shape7; \ + } \ + \ + dst_mem[dst_offset] = src_mem[src_offset]; \ +} + +GEN_COPY_KERNEL(1b, uint8_t) +GEN_COPY_KERNEL(2b, uint16_t) +GEN_COPY_KERNEL(4b, uint32_t) +GEN_COPY_KERNEL(8b, uint64_t) + +// End of copy.cl + + + +FUTHARK_KERNEL +void builtinzhreplicate_i32zireplicate_6875(__local uint64_t *shared_mem_aligned, int64_t num_elems_6871, int32_t val_6872, int64_t replicate_n_6874, int64_t virt_num_tblocks_6880, int64_t num_tblocks_6881, __global unsigned char *mem_6870) +{ + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + int32_t replicate_ltid_6876; + int32_t tblock_sizze_6878; + int32_t replicate_gid_6877; + int32_t replicate_gtid_6875; + int32_t phys_tblock_id_6882; + int32_t iterations_6883; + + replicate_ltid_6876 = get_local_id(0); + tblock_sizze_6878 = get_local_size(0); + replicate_gid_6877 = get_tblock_id(0); + replicate_gtid_6875 = replicate_gid_6877 * tblock_sizze_6878 + replicate_ltid_6876; + phys_tblock_id_6882 = get_tblock_id(0); + iterations_6883 = sdiv_up32(sext_i64_i32(virt_num_tblocks_6880) - phys_tblock_id_6882, sext_i64_i32(num_tblocks_6881)); + for (int32_t i_6884 = 0; i_6884 < iterations_6883; i_6884++) { + int32_t virt_tblock_id_6885; + int64_t global_tid_6886; + int64_t slice_6888; + int64_t rep_i_6887; + int64_t remnant_6889; + + virt_tblock_id_6885 = phys_tblock_id_6882 + i_6884 * sext_i64_i32(num_tblocks_6881); + global_tid_6886 = sext_i32_i64(virt_tblock_id_6885) * sext_i32_i64(tblock_sizze_6878) + sext_i32_i64(replicate_ltid_6876); + slice_6888 = num_elems_6871; + rep_i_6887 = global_tid_6886; + remnant_6889 = global_tid_6886 - rep_i_6887; + if (slt64(global_tid_6886, replicate_n_6874)) { + ((__global int32_t *) mem_6870)[rep_i_6887] = val_6872; + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_1: + return; +} +FUTHARK_KERNEL +void builtinzhreplicate_i64zireplicate_6799(__local uint64_t *shared_mem_aligned, int64_t num_elems_6795, int64_t val_6796, int64_t replicate_n_6798, int64_t virt_num_tblocks_6804, int64_t num_tblocks_6805, __global unsigned char *mem_6794) +{ + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + int32_t replicate_ltid_6800; + int32_t tblock_sizze_6802; + int32_t replicate_gid_6801; + int32_t replicate_gtid_6799; + int32_t phys_tblock_id_6806; + int32_t iterations_6807; + + replicate_ltid_6800 = get_local_id(0); + tblock_sizze_6802 = get_local_size(0); + replicate_gid_6801 = get_tblock_id(0); + replicate_gtid_6799 = replicate_gid_6801 * tblock_sizze_6802 + replicate_ltid_6800; + phys_tblock_id_6806 = get_tblock_id(0); + iterations_6807 = sdiv_up32(sext_i64_i32(virt_num_tblocks_6804) - phys_tblock_id_6806, sext_i64_i32(num_tblocks_6805)); + for (int32_t i_6808 = 0; i_6808 < iterations_6807; i_6808++) { + int32_t virt_tblock_id_6809; + int64_t 
global_tid_6810; + int64_t slice_6812; + int64_t rep_i_6811; + int64_t remnant_6813; + + virt_tblock_id_6809 = phys_tblock_id_6806 + i_6808 * sext_i64_i32(num_tblocks_6805); + global_tid_6810 = sext_i32_i64(virt_tblock_id_6809) * sext_i32_i64(tblock_sizze_6802) + sext_i32_i64(replicate_ltid_6800); + slice_6812 = num_elems_6795; + rep_i_6811 = global_tid_6810; + remnant_6813 = global_tid_6810 - rep_i_6811; + if (slt64(global_tid_6810, replicate_n_6798)) { + ((__global int64_t *) mem_6794)[rep_i_6811] = val_6796; + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_1: + return; +} +FUTHARK_KERNEL_SIZED(byte_histogramziseghist_global_6312_dim1, 1, 1) +void byte_histogramziseghist_global_6312(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t n_5765, int64_t num_tblocks_6307, int64_t num_subhistos_6815, int32_t chk_i_6885, int64_t hist_H_chk_6886, __global unsigned char *xs_mem_6757, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define seghist_tblock_sizze_6305 (byte_histogramziseghist_global_6312ziseghist_tblock_sizze_6305) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6888; + int32_t tblock_sizze_6891; + int32_t wave_sizze_6890; + int32_t block_id_6889; + int32_t global_tid_6887; + int64_t phys_tid_6312; + int32_t subhisto_ind_6892; + int64_t num_chunks_6893; + + local_tid_6888 = get_local_id(0); + tblock_sizze_6891 = get_local_size(0); + wave_sizze_6890 = LOCKSTEP_WIDTH; + block_id_6889 = get_tblock_id(0); + global_tid_6887 = block_id_6889 * tblock_sizze_6891 + local_tid_6888; + phys_tid_6312 = sext_i32_i64(global_tid_6887); + subhisto_ind_6892 = squot32(global_tid_6887, sdiv_up32(sext_i64_i32(seghist_tblock_sizze_6305 * num_tblocks_6307), sext_i64_i32(num_subhistos_6815))); + num_chunks_6893 = sdiv_up64(n_5765, sext_i32_i64(sext_i64_i32(seghist_tblock_sizze_6305 * num_tblocks_6307))); + for (int64_t chunk_i_6894 = 0; chunk_i_6894 < num_chunks_6893; chunk_i_6894++) { + int64_t i_6895 = chunk_i_6894 * sext_i32_i64(sext_i64_i32(seghist_tblock_sizze_6305 * num_tblocks_6307)) + sext_i32_i64(global_tid_6887); + + if (slt64(i_6895, n_5765)) { + int64_t slice_6896; + int64_t gtid_6311; + int64_t remnant_6897; + + slice_6896 = n_5765; + gtid_6311 = i_6895; + remnant_6897 = i_6895 - gtid_6311; + if (slt64(i_6895, n_5765)) { + int8_t eta_p_6316; + int64_t u8_res_6318; + + eta_p_6316 = ((__global int8_t *) xs_mem_6757)[gtid_6311]; + u8_res_6318 = zext_i8_i64(eta_p_6316); + // save map-out results + { } + // perform atomic updates + { + if (sle64(sext_i32_i64(chk_i_6885) * hist_H_chk_6886, u8_res_6318) && (slt64(u8_res_6318, sext_i32_i64(chk_i_6885) * hist_H_chk_6886 + hist_H_chk_6886) && (sle64((int64_t) 0, u8_res_6318) && slt64(u8_res_6318, (int64_t) 256)))) { + int64_t eta_p_6313; + int64_t eta_p_6314 = (int64_t) 1; + int64_t old_6898; + + old_6898 = atomic_add_i64_global(&((volatile __global int64_t *) defunc_0_map_res_subhistos_mem_6816)[sext_i32_i64(subhisto_ind_6892) * (int64_t) 256 + u8_res_6318], (int64_t) eta_p_6314); + } + } + } + } + } + + error_0: + return; + #undef seghist_tblock_sizze_6305 +} +FUTHARK_KERNEL_SIZED(byte_histogramziseghist_local_6312_dim1, 1, 1) +void byte_histogramziseghist_local_6312(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t n_5765, int64_t num_subhistos_6815, int64_t num_tblocks_6826, int32_t hist_M_6832, int32_t chk_i_6836, int64_t num_segments_6837, int64_t 
hist_H_chk_6838, int64_t histo_sizze_6839, int32_t init_per_thread_6840, __global unsigned char *xs_mem_6757, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define max_tblock_sizze_6825 (byte_histogramziseghist_local_6312zimax_tblock_sizze_6825) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *subhistogram_local_mem_6854_backing_0 = &shared_mem[0]; + const int64_t subhistogram_local_mem_6854_backing_0_offset = 0 + ((int64_t) 8 * (hist_M_6832 * hist_H_chk_6838) + srem64((int64_t) 8 - srem64((int64_t) 8 * (hist_M_6832 * hist_H_chk_6838), (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6842; + int32_t tblock_sizze_6845; + int32_t wave_sizze_6844; + int32_t block_id_6843; + int32_t global_tid_6841; + int64_t phys_tid_6312; + int32_t phys_tblock_id_6846; + int32_t iterations_6847; + + local_tid_6842 = get_local_id(0); + tblock_sizze_6845 = get_local_size(0); + wave_sizze_6844 = LOCKSTEP_WIDTH; + block_id_6843 = get_tblock_id(0); + global_tid_6841 = block_id_6843 * tblock_sizze_6845 + local_tid_6842; + phys_tid_6312 = sext_i32_i64(global_tid_6841); + phys_tblock_id_6846 = get_tblock_id(0); + iterations_6847 = sdiv_up32(sext_i64_i32(num_tblocks_6826 * num_segments_6837) - phys_tblock_id_6846, sext_i64_i32(num_tblocks_6826)); + for (int32_t i_6848 = 0; i_6848 < iterations_6847; i_6848++) { + int32_t virt_tblock_id_6849; + int32_t flat_segment_id_6850; + int32_t gid_in_segment_6851; + int32_t pgtid_in_segment_6852; + int32_t threads_per_segment_6853; + __local unsigned char *subhistogram_local_mem_6854; + int32_t thread_local_subhisto_i_6856; + int64_t num_chunks_6863; + + virt_tblock_id_6849 = phys_tblock_id_6846 + i_6848 * sext_i64_i32(num_tblocks_6826); + flat_segment_id_6850 = squot32(virt_tblock_id_6849, sext_i64_i32(num_tblocks_6826)); + gid_in_segment_6851 = srem32(virt_tblock_id_6849, sext_i64_i32(num_tblocks_6826)); + pgtid_in_segment_6852 = gid_in_segment_6851 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + threads_per_segment_6853 = sext_i64_i32(num_tblocks_6826 * max_tblock_sizze_6825); + subhistogram_local_mem_6854 = (__local unsigned char *) subhistogram_local_mem_6854_backing_0; + thread_local_subhisto_i_6856 = srem32(local_tid_6842, hist_M_6832); + // initialize histograms in shared memory + { + for (int32_t local_i_6857 = 0; local_i_6857 < init_per_thread_6840; local_i_6857++) { + int32_t j_6858 = local_i_6857 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + int32_t j_offset_6859 = hist_M_6832 * sext_i64_i32(histo_sizze_6839) * gid_in_segment_6851 + j_6858; + int32_t local_subhisto_i_6860 = squot32(j_6858, sext_i64_i32(histo_sizze_6839)); + int32_t global_subhisto_i_6861 = squot32(j_offset_6859, sext_i64_i32(histo_sizze_6839)); + + if (slt32(j_6858, hist_M_6832 * sext_i64_i32(histo_sizze_6839))) { + // First subhistogram is initialised from global memory; others with neutral element. 
+ { + if (global_subhisto_i_6861 == 0 && ((sle64((int64_t) 0, (int64_t) 0) && slt64((int64_t) 0, num_subhistos_6815)) && (sle64((int64_t) 0, sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838) && slt64(sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838, (int64_t) 256)))) { + int64_t tmp_6862 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838]; + + ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(local_subhisto_i_6860) * hist_H_chk_6838 + sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839)))] = tmp_6862; + } else { + ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(local_subhisto_i_6860) * hist_H_chk_6838 + sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839)))] = (int64_t) 0; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + num_chunks_6863 = sdiv_up64(n_5765, sext_i32_i64(threads_per_segment_6853)); + for (int64_t chunk_i_6864 = 0; chunk_i_6864 < num_chunks_6863; chunk_i_6864++) { + int64_t i_6865 = chunk_i_6864 * sext_i32_i64(threads_per_segment_6853) + sext_i32_i64(pgtid_in_segment_6852); + + if (slt64(i_6865, n_5765)) { + int64_t gtid_6311; + int8_t eta_p_6316; + int64_t u8_res_6318; + + gtid_6311 = i_6865; + eta_p_6316 = ((__global int8_t *) xs_mem_6757)[gtid_6311]; + u8_res_6318 = zext_i8_i64(eta_p_6316); + if (chk_i_6836 == 0) { + // save map-out results + { } + } + // perform atomic updates + { + if ((sle64((int64_t) 0, u8_res_6318) && slt64(u8_res_6318, (int64_t) 256)) && (sle64(sext_i32_i64(chk_i_6836) * hist_H_chk_6838, u8_res_6318) && slt64(u8_res_6318, sext_i32_i64(chk_i_6836) * hist_H_chk_6838 + hist_H_chk_6838))) { + int64_t eta_p_6313; + int64_t eta_p_6314 = (int64_t) 1; + int64_t old_6866; + + old_6866 = atomic_add_i64_shared(&((volatile __local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(thread_local_subhisto_i_6856) * hist_H_chk_6838 + (u8_res_6318 - sext_i32_i64(chk_i_6836) * hist_H_chk_6838)], (int64_t) eta_p_6314); + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + // Compact the multiple shared memory subhistograms to result in global memory + { + int64_t trunc_H_6867 = smin64(hist_H_chk_6838, (int64_t) 256 - sext_i32_i64(chk_i_6836) * hist_H_chk_6838); + int32_t histo_sizze_6868 = sext_i64_i32(trunc_H_6867); + + for (int32_t local_i_6869 = 0; local_i_6869 < init_per_thread_6840; local_i_6869++) { + int32_t j_6870 = local_i_6869 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + + if (slt32(j_6870, histo_sizze_6868)) { + int64_t eta_p_6313; + int64_t eta_p_6314; + + // Read values from subhistogram 0. + { + eta_p_6313 = ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(j_6870)]; + } + // Accumulate based on values in other subhistograms. + { + for (int32_t subhisto_id_6871 = 0; subhisto_id_6871 < hist_M_6832 - 1; subhisto_id_6871++) { + eta_p_6314 = ((__local int64_t *) subhistogram_local_mem_6854)[(sext_i32_i64(subhisto_id_6871) + (int64_t) 1) * hist_H_chk_6838 + sext_i32_i64(j_6870)]; + + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + eta_p_6313 = defunc_0_op_res_6315; + } + } + // Put final bucket value in global memory. 
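+ // [annotation] each block then writes its summed buckets into its own slot of the global subhistogram array (virt_tblock_id mod num_tblocks); the segred kernels below combine those slots into the final 256-bucket histogram.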
+ { + ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[srem64(sext_i32_i64(virt_tblock_id_6849), num_tblocks_6826) * (int64_t) 256 + (sext_i32_i64(j_6870) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838)] = eta_p_6313; + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_2: + return; + #undef max_tblock_sizze_6825 +} +FUTHARK_KERNEL_SIZED(byte_histogramzisegred_large_6901_dim1, 1, 1) +void byte_histogramzisegred_large_6901(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t num_tblocks_6307, int64_t num_subhistos_6815, int64_t blocks_per_segment_6932, int64_t q_6933, int64_t num_virtblocks_6934, int64_t threads_per_segment_6935, __global unsigned char *mem_6758, __global unsigned char *defunc_0_map_res_subhistos_mem_6816, __global unsigned char *segred_tmp_mem_6936, __global unsigned char *counters_mem_6938) +{ + #define seghist_tblock_sizze_6305 (byte_histogramzisegred_large_6901ziseghist_tblock_sizze_6305) + #define chunk_sizze_6902 (byte_histogramzisegred_large_6901zichunk_sizze_6902) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *sync_arr_mem_6967_backing_1 = &shared_mem[0]; + const int64_t sync_arr_mem_6967_backing_1_offset = 0 + 8; + volatile __local unsigned char *red_arr_i64_mem_6965_backing_0 = &shared_mem[sync_arr_mem_6967_backing_1_offset]; + const int64_t red_arr_i64_mem_6965_backing_0_offset = sync_arr_mem_6967_backing_1_offset + ((int64_t) 8 * seghist_tblock_sizze_6305 + srem64((int64_t) 8 - srem64((int64_t) 8 * seghist_tblock_sizze_6305, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6961; + int32_t tblock_sizze_6964; + int32_t wave_sizze_6963; + int32_t block_id_6962; + int32_t global_tid_6960; + int64_t flat_gtid_6901; + __local unsigned char *red_arr_i64_mem_6965; + __local unsigned char *sync_arr_mem_6967; + int32_t phys_tblock_id_6969; + int32_t iterations_6970; + + local_tid_6961 = get_local_id(0); + tblock_sizze_6964 = get_local_size(0); + wave_sizze_6963 = LOCKSTEP_WIDTH; + block_id_6962 = get_tblock_id(0); + global_tid_6960 = block_id_6962 * tblock_sizze_6964 + local_tid_6961; + flat_gtid_6901 = sext_i32_i64(global_tid_6960); + red_arr_i64_mem_6965 = (__local unsigned char *) red_arr_i64_mem_6965_backing_0; + sync_arr_mem_6967 = (__local unsigned char *) sync_arr_mem_6967_backing_1; + phys_tblock_id_6969 = get_tblock_id(0); + iterations_6970 = sdiv_up32(sext_i64_i32(num_virtblocks_6934) - phys_tblock_id_6969, sext_i64_i32(num_tblocks_6307)); + for (int32_t i_6971 = 0; i_6971 < iterations_6970; i_6971++) { + int32_t virt_tblock_id_6972; + int64_t flat_segment_id_6973; + int64_t global_tid_6974; + int64_t slice_6975; + int64_t bucket_id_6899; + int64_t remnant_6976; + int64_t subhistogram_id_6900; + int64_t eta_p_block_res_acc_6977; + int64_t eta_p_6313; + int64_t eta_p_6314; + int64_t tblock_id_in_segment_6981; + int64_t block_base_offset_6982; + int32_t offset_6985; + int32_t skip_waves_6986; + int64_t eta_p_6978; + int64_t eta_p_6979; + + virt_tblock_id_6972 = phys_tblock_id_6969 + i_6971 * sext_i64_i32(num_tblocks_6307); + flat_segment_id_6973 = squot64(sext_i32_i64(virt_tblock_id_6972), blocks_per_segment_6932); + global_tid_6974 = srem64(sext_i32_i64(virt_tblock_id_6972) * seghist_tblock_sizze_6305 + sext_i32_i64(local_tid_6961), threads_per_segment_6935); + slice_6975 = (int64_t) 256; + bucket_id_6899 = flat_segment_id_6973; + remnant_6976 = flat_segment_id_6973 - bucket_id_6899; + // 
ne-initialise the outer (per-block) accumulator(s) + { + eta_p_block_res_acc_6977 = (int64_t) 0; + } + tblock_id_in_segment_6981 = squot64(global_tid_6974, seghist_tblock_sizze_6305); + block_base_offset_6982 = tblock_id_in_segment_6981 * q_6933 * seghist_tblock_sizze_6305; + for (int64_t i_6983 = 0; i_6983 < q_6933; i_6983++) { + int64_t block_offset_6984 = block_base_offset_6982 + i_6983 * seghist_tblock_sizze_6305; + + subhistogram_id_6900 = global_tid_6974 + threads_per_segment_6935 * i_6983; + if (slt64(subhistogram_id_6900, num_subhistos_6815)) { + // apply map function(s) + { + // load accumulator(s) + { + eta_p_6313 = eta_p_block_res_acc_6977; + } + // load next value(s) + { + eta_p_6314 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[subhistogram_id_6900 * (int64_t) 256 + bucket_id_6899]; + } + // apply reduction operator(s) + { + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + // store in accumulator(s) + { + eta_p_block_res_acc_6977 = defunc_0_op_res_6315; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // store accs. prims go in lmem; non-prims in params (in global mem) + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_block_res_acc_6977; + } + barrier(CLK_LOCAL_MEM_FENCE); + skip_waves_6986 = 1; + offset_6985 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6961, sext_i64_i32(seghist_tblock_sizze_6305))) { + eta_p_6978 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + } + offset_6985 = 1; + while (slt32(offset_6985, wave_sizze_6963)) { + if (slt32(local_tid_6961 + offset_6985, sext_i64_i32(seghist_tblock_sizze_6305)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) & (2 * offset_6985 - 1)) == 0) { + // read array element + { + eta_p_6979 = ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + offset_6985 *= 2; + } + while (slt32(skip_waves_6986, squot32(sext_i64_i32(seghist_tblock_sizze_6305) + wave_sizze_6963 - 1, wave_sizze_6963))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6985 = skip_waves_6986 * wave_sizze_6963; + if (slt32(local_tid_6961 + offset_6985, sext_i64_i32(seghist_tblock_sizze_6305)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) == 0 && (squot32(local_tid_6961, wave_sizze_6963) & (2 * skip_waves_6986 - 1)) == 0)) { + // read array element + { + eta_p_6979 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + skip_waves_6986 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + // thread 0 updates per-block acc(s); rest reset to ne + { + if (sext_i32_i64(local_tid_6961) == (int64_t) 0) { + eta_p_block_res_acc_6977 = eta_p_6978; + } else { + eta_p_block_res_acc_6977 = (int64_t) 0; + } + } + if (blocks_per_segment_6932 == (int64_t) 1) { + // first thread in block saves final result to 
memory + { + if (local_tid_6961 == 0) { + ((__global int64_t *) mem_6758)[bucket_id_6899] = eta_p_block_res_acc_6977; + } + } + } else { + int32_t old_counter_6987; + bool is_last_block_6988; + + // first thread in block saves block result to global memory + { + if (local_tid_6961 == 0) { + ((__global int64_t *) segred_tmp_mem_6936)[sext_i32_i64(virt_tblock_id_6972)] = eta_p_block_res_acc_6977; + mem_fence_global(); + old_counter_6987 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6938)[srem64(flat_segment_id_6973, (int64_t) 20480)], (int) 1); + ((__local bool *) sync_arr_mem_6967)[(int64_t) 0] = old_counter_6987 == sext_i64_i32(blocks_per_segment_6932 - (int64_t) 1); + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + is_last_block_6988 = ((__local bool *) sync_arr_mem_6967)[(int64_t) 0]; + if (is_last_block_6988) { + if (local_tid_6961 == 0) { + old_counter_6987 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6938)[srem64(flat_segment_id_6973, (int64_t) 20480)], (int) sext_i64_i32((int64_t) 0 - blocks_per_segment_6932)); + } + // read in the per-block-results + { + int64_t read_per_thread_6989 = sdiv_up64(blocks_per_segment_6932, seghist_tblock_sizze_6305); + + eta_p_6313 = (int64_t) 0; + for (int64_t i_6990 = 0; i_6990 < read_per_thread_6989; i_6990++) { + int64_t block_res_id_6991 = sext_i32_i64(local_tid_6961) * read_per_thread_6989 + i_6990; + int64_t index_of_block_res_6992 = flat_segment_id_6973 * blocks_per_segment_6932 + block_res_id_6991; + + if (slt64(block_res_id_6991, blocks_per_segment_6932)) { + eta_p_6314 = ((__global int64_t *) segred_tmp_mem_6936)[index_of_block_res_6992]; + + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + eta_p_6313 = defunc_0_op_res_6315; + } + } + } + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6313; + barrier(CLK_LOCAL_MEM_FENCE); + // reduce the per-block results + { + int32_t offset_6993; + int32_t skip_waves_6994 = 1; + int64_t eta_p_6978; + int64_t eta_p_6979; + + offset_6993 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6961, sext_i64_i32(seghist_tblock_sizze_6305))) { + eta_p_6978 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + } + offset_6993 = 1; + while (slt32(offset_6993, wave_sizze_6963)) { + if (slt32(local_tid_6961 + offset_6993, sext_i64_i32(seghist_tblock_sizze_6305)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) & (2 * offset_6993 - 1)) == 0) { + // read array element + { + eta_p_6979 = ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + offset_6993 *= 2; + } + while (slt32(skip_waves_6994, squot32(sext_i64_i32(seghist_tblock_sizze_6305) + wave_sizze_6963 - 1, wave_sizze_6963))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6993 = skip_waves_6994 * wave_sizze_6963; + if (slt32(local_tid_6961 + offset_6993, sext_i64_i32(seghist_tblock_sizze_6305)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) == 0 && (squot32(local_tid_6961, wave_sizze_6963) & (2 * skip_waves_6994 - 1)) == 0)) { + // read array element + { + eta_p_6979 = ((__local int64_t *) 
red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + skip_waves_6994 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + // and back to memory with the final result + { + if (local_tid_6961 == 0) { + ((__global int64_t *) mem_6758)[bucket_id_6899] = eta_p_6978; + } + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_6: + return; + #undef seghist_tblock_sizze_6305 + #undef chunk_sizze_6902 +} +FUTHARK_KERNEL_SIZED(byte_histogramzisegred_small_6901_dim1, 1, 1) +void byte_histogramzisegred_small_6901(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t num_tblocks_6307, int64_t num_subhistos_6815, int64_t segment_sizze_nonzzero_6903, __global unsigned char *mem_6758, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define seghist_tblock_sizze_6305 (byte_histogramzisegred_small_6901ziseghist_tblock_sizze_6305) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *red_arr_i64_mem_6910_backing_0 = &shared_mem[0]; + const int64_t red_arr_i64_mem_6910_backing_0_offset = 0 + ((int64_t) 8 * seghist_tblock_sizze_6305 + srem64((int64_t) 8 - srem64((int64_t) 8 * seghist_tblock_sizze_6305, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6906; + int32_t tblock_sizze_6909; + int32_t wave_sizze_6908; + int32_t block_id_6907; + int32_t global_tid_6905; + int64_t flat_gtid_6901; + __local unsigned char *red_arr_i64_mem_6910; + int32_t phys_tblock_id_6912; + int32_t iterations_6913; + + local_tid_6906 = get_local_id(0); + tblock_sizze_6909 = get_local_size(0); + wave_sizze_6908 = LOCKSTEP_WIDTH; + block_id_6907 = get_tblock_id(0); + global_tid_6905 = block_id_6907 * tblock_sizze_6909 + local_tid_6906; + flat_gtid_6901 = sext_i32_i64(global_tid_6905); + red_arr_i64_mem_6910 = (__local unsigned char *) red_arr_i64_mem_6910_backing_0; + phys_tblock_id_6912 = get_tblock_id(0); + iterations_6913 = sdiv_up32(sext_i64_i32(sdiv_up64((int64_t) 256, squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903))) - phys_tblock_id_6912, sext_i64_i32(num_tblocks_6307)); + for (int32_t i_6914 = 0; i_6914 < iterations_6913; i_6914++) { + int32_t virt_tblock_id_6915; + int64_t slice_6916; + int64_t bucket_id_6899; + int64_t remnant_6917; + int64_t subhistogram_id_6900; + + virt_tblock_id_6915 = phys_tblock_id_6912 + i_6914 * sext_i64_i32(num_tblocks_6307); + slice_6916 = (int64_t) 256; + bucket_id_6899 = squot64(sext_i32_i64(local_tid_6906), segment_sizze_nonzzero_6903) + sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903); + remnant_6917 = squot64(sext_i32_i64(local_tid_6906), segment_sizze_nonzzero_6903) + sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903) - bucket_id_6899; + subhistogram_id_6900 = srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815); + // apply map function if in bounds + { + if (slt64((int64_t) 0, num_subhistos_6815) && (slt64(bucket_id_6899, (int64_t) 256) && slt64(sext_i32_i64(local_tid_6906), num_subhistos_6815 * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903)))) { + // save results to be reduced + { + int64_t tmp_6918 = 
((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[subhistogram_id_6900 * (int64_t) 256 + bucket_id_6899]; + + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = tmp_6918; + } + } else { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = (int64_t) 0; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (slt64((int64_t) 0, num_subhistos_6815)) { + // perform segmented scan to imitate reduction + { + int64_t eta_p_6313; + int64_t eta_p_6314; + int64_t eta_p_6919; + int64_t eta_p_6920; + bool ltid_in_bounds_6922 = slt64(sext_i32_i64(local_tid_6906), num_subhistos_6815 * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903)); + int32_t skip_threads_6923; + + // read input for in-block scan + { + if (ltid_in_bounds_6922) { + eta_p_6314 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)]; + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 0) { + eta_p_6313 = eta_p_6314; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6923 = 1; + while (slt32(skip_threads_6923, 32)) { + bool thread_active_6924 = sle32(skip_threads_6923, local_tid_6906 - squot32(local_tid_6906, 32) * 32) && ltid_in_bounds_6922; + + if (thread_active_6924) { + // read operands + { + eta_p_6313 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906) - sext_i32_i64(skip_threads_6923)]; + } + } + // perform operation + { + bool inactive_6925 = slt64(srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815), sext_i32_i64(local_tid_6906) - sext_i32_i64(local_tid_6906 - skip_threads_6923)); + + if (thread_active_6924 && inactive_6925) { + eta_p_6313 = eta_p_6314; + } + if (thread_active_6924) { + if (!inactive_6925) { + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + eta_p_6313 = defunc_0_op_res_6315; + } + } + } + if (sle32(wave_sizze_6908, skip_threads_6923)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6924) { + // write result + { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6313; + eta_p_6314 = eta_p_6313; + } + } + if (sle32(wave_sizze_6908, skip_threads_6923)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6923 *= 2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // last thread of block 'i' writes its result to offset 'i' + { + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 31 && ltid_in_bounds_6922) { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(squot32(local_tid_6906, 32))] = eta_p_6313; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // scan the first block, after which offset 'i' contains carry-in for block 'i+1' + { + int32_t skip_threads_6926; + + // read input for in-block scan + { + if (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922) { + eta_p_6920 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)]; + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 0) { + eta_p_6919 = eta_p_6920; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6926 = 1; + while (slt32(skip_threads_6926, 32)) { + bool thread_active_6927 = sle32(skip_threads_6926, local_tid_6906 - squot32(local_tid_6906, 32) * 32) && (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922); + + if (thread_active_6927) { + // read operands + { + eta_p_6919 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906) - sext_i32_i64(skip_threads_6926)]; + } + } + // perform operation + { + bool inactive_6928 
= slt64(srem64(sext_i32_i64(local_tid_6906 * 32 + 32 - 1), num_subhistos_6815), sext_i32_i64(local_tid_6906 * 32 + 32 - 1) - sext_i32_i64((local_tid_6906 - skip_threads_6926) * 32 + 32 - 1)); + + if (thread_active_6927 && inactive_6928) { + eta_p_6919 = eta_p_6920; + } + if (thread_active_6927) { + if (!inactive_6928) { + int64_t defunc_0_op_res_6921 = add64(eta_p_6919, eta_p_6920); + + eta_p_6919 = defunc_0_op_res_6921; + } + } + } + if (sle32(wave_sizze_6908, skip_threads_6926)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6927) { + // write result + { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6919; + eta_p_6920 = eta_p_6919; + } + } + if (sle32(wave_sizze_6908, skip_threads_6926)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6926 *= 2; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + bool no_carry_in_6929 = squot32(local_tid_6906, 32) == 0 || !ltid_in_bounds_6922; + + // carry-in for every block except the first + { + // read operands + { + if (!no_carry_in_6929) { + eta_p_6314 = eta_p_6313; + eta_p_6313 = ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(squot32(local_tid_6906, 32)) - (int64_t) 1]; + } + } + // perform operation + { + bool inactive_6930 = slt64(srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815), sext_i32_i64(local_tid_6906) - sext_i32_i64(squot32(local_tid_6906, 32) * 32 - 1)); + + if (!no_carry_in_6929) { + if (inactive_6930) { + eta_p_6313 = eta_p_6314; + } + } + if (!no_carry_in_6929) { + if (!inactive_6930) { + int64_t defunc_0_op_res_6315 = add64(eta_p_6313, eta_p_6314); + + eta_p_6313 = defunc_0_op_res_6315; + } + } + } + // write final result + { + if (!no_carry_in_6929) { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6313; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // restore correct values for first block + { + if (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922) { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6314; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // save final values of segments + { + if (slt64(sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903) + sext_i32_i64(local_tid_6906), (int64_t) 256) && slt64(sext_i32_i64(local_tid_6906), squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903))) { + int64_t tmp_6931 = ((__local int64_t *) red_arr_i64_mem_6910)[(sext_i32_i64(local_tid_6906) + (int64_t) 1) * segment_sizze_nonzzero_6903 - (int64_t) 1]; + + ((__global int64_t *) mem_6758)[sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6305, segment_sizze_nonzzero_6903) + sext_i32_i64(local_tid_6906)] = tmp_6931; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_3: + return; + #undef seghist_tblock_sizze_6305 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegmap_6405_dim1, 1, 1) +void chunked_entropyzisegmap_6405(__local uint64_t *shared_mem_aligned, __global int *global_failure, int failure_is_an_option, __global int64_t *global_failure_args, int64_t n_6046, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, __global unsigned char *xs_mem_6757, __global unsigned char *mem_6791) +{ + #define segmap_tblock_sizze_6401 (chunked_entropyzisegmap_6405zisegmap_tblock_sizze_6401) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + 
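// [annotation] one thread per chunk: build a private 256-bucket byte histogram for the chunk, reduce it to a Shannon entropy value, and store the result scaled into a single byte (0 = minimum entropy, 255 = maximum). +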
int32_t local_tid_6799; + int32_t tblock_sizze_6802; + int32_t wave_sizze_6801; + int32_t block_id_6800; + int32_t global_tid_6798; + int64_t phys_tid_6405; + int64_t global_tid_6803; + int64_t slice_6804; + int64_t gtid_6404; + int64_t remnant_6805; + + local_tid_6799 = get_local_id(0); + tblock_sizze_6802 = get_local_size(0); + wave_sizze_6801 = LOCKSTEP_WIDTH; + block_id_6800 = get_tblock_id(0); + global_tid_6798 = block_id_6800 * tblock_sizze_6802 + local_tid_6799; + phys_tid_6405 = sext_i32_i64(global_tid_6798); + global_tid_6803 = sext_i32_i64(block_id_6800) * segmap_tblock_sizze_6401 + sext_i32_i64(local_tid_6799); + slice_6804 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6404 = global_tid_6803; + remnant_6805 = global_tid_6803 - gtid_6404; + if (slt64(gtid_6404, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197)) { + int64_t entropy_arg0_6407; + int64_t zt_lhs_6408; + int64_t entropy_arg0_6409; + int64_t j_m_i_6410; + bool empty_slice_6411; + int64_t m_6412; + int64_t i_p_m_t_s_6413; + bool zzero_leq_i_p_m_t_s_6414; + bool i_p_m_t_s_leq_w_6415; + bool zzero_lte_i_6416; + bool i_lte_j_6417; + bool y_6418; + bool y_6419; + bool forwards_ok_6420; + bool ok_or_empty_6421; + bool index_certs_6422; + int64_t mem_6788[(int64_t) 256]; + float i64_res_6433; + float defunc_0_f_res_6434; + float redout_6742; + float zs_lhs_6445; + float log2_res_6446; + float lifted_lambda_res_6447; + float floor_arg0_6448; + float floor_res_6449; + int8_t unsign_arg0_6450; + + entropy_arg0_6407 = mul64(chunk_sizze_6047, gtid_6404); + zt_lhs_6408 = add64((int64_t) 1, gtid_6404); + entropy_arg0_6409 = mul64(chunk_sizze_6047, zt_lhs_6408); + j_m_i_6410 = sub64(entropy_arg0_6409, entropy_arg0_6407); + empty_slice_6411 = j_m_i_6410 == (int64_t) 0; + m_6412 = sub64(j_m_i_6410, (int64_t) 1); + i_p_m_t_s_6413 = add64(entropy_arg0_6407, m_6412); + zzero_leq_i_p_m_t_s_6414 = sle64((int64_t) 0, i_p_m_t_s_6413); + i_p_m_t_s_leq_w_6415 = slt64(i_p_m_t_s_6413, n_6046); + zzero_lte_i_6416 = sle64((int64_t) 0, entropy_arg0_6407); + i_lte_j_6417 = sle64(entropy_arg0_6407, entropy_arg0_6409); + y_6418 = i_p_m_t_s_leq_w_6415 && zzero_lte_i_6416; + y_6419 = zzero_leq_i_p_m_t_s_6414 && y_6418; + forwards_ok_6420 = i_lte_j_6417 && y_6419; + ok_or_empty_6421 = empty_slice_6411 || forwards_ok_6420; + if (!ok_or_empty_6421) { + { + if (atomic_cmpxchg_i32_global(global_failure, -1, 0) == -1) { + global_failure_args[0] = (int64_t) entropy_arg0_6407; + global_failure_args[1] = (int64_t) entropy_arg0_6409; + global_failure_args[2] = (int64_t) n_6046; + ; + } + return; + } + } + for (int64_t nest_i_6806 = 0; nest_i_6806 < (int64_t) 256; nest_i_6806++) { + mem_6788[nest_i_6806] = (int64_t) 0; + } + for (int64_t iter_6731 = 0; iter_6731 < j_m_i_6410; iter_6731++) { + int64_t slice_6755; + int8_t pixel_6733; + int64_t u8_res_6432; + bool less_than_zzero_6735; + bool greater_than_sizze_6736; + bool outside_bounds_dim_6737; + + slice_6755 = entropy_arg0_6407 + iter_6731; + pixel_6733 = ((__global int8_t *) xs_mem_6757)[slice_6755]; + u8_res_6432 = zext_i8_i64(pixel_6733); + less_than_zzero_6735 = slt64(u8_res_6432, (int64_t) 0); + greater_than_sizze_6736 = sle64((int64_t) 256, u8_res_6432); + outside_bounds_dim_6737 = less_than_zzero_6735 || greater_than_sizze_6736; + if (!outside_bounds_dim_6737) { + int64_t read_hist_6739; + int64_t defunc_0_op_res_6429; + + read_hist_6739 = mem_6788[u8_res_6432]; + defunc_0_op_res_6429 = add64((int64_t) 1, read_hist_6739); + mem_6788[u8_res_6432] = defunc_0_op_res_6429; + } + } + i64_res_6433 
= sitofp_i64_f32(j_m_i_6410); + redout_6742 = 0.0F; + for (int64_t i_6743 = 0; i_6743 < (int64_t) 256; i_6743++) { + int64_t eta_p_6438; + float i64_res_6439; + float lifted_lambda_res_6440; + bool cond_6441; + float lifted_lambda_res_6442; + float defunc_0_op_res_6437; + float redout_tmp_6808; + + eta_p_6438 = mem_6788[i_6743]; + i64_res_6439 = sitofp_i64_f32(eta_p_6438); + lifted_lambda_res_6440 = i64_res_6439 / i64_res_6433; + cond_6441 = lifted_lambda_res_6440 == 0.0F; + if (cond_6441) { + lifted_lambda_res_6442 = 0.0F; + } else { + float log2_res_6443; + float lifted_lambda_res_f_res_6444; + + log2_res_6443 = futrts_log2_32(lifted_lambda_res_6440); + lifted_lambda_res_f_res_6444 = lifted_lambda_res_6440 * log2_res_6443; + lifted_lambda_res_6442 = lifted_lambda_res_f_res_6444; + } + defunc_0_op_res_6437 = lifted_lambda_res_6442 + redout_6742; + redout_tmp_6808 = defunc_0_op_res_6437; + redout_6742 = redout_tmp_6808; + } + defunc_0_f_res_6434 = redout_6742; + zs_lhs_6445 = -1.0F * defunc_0_f_res_6434; + log2_res_6446 = futrts_log2_32(i64_res_6433); + lifted_lambda_res_6447 = zs_lhs_6445 / log2_res_6446; + floor_arg0_6448 = 255.0F * lifted_lambda_res_6447; + floor_res_6449 = futrts_floor32(floor_arg0_6448); + unsign_arg0_6450 = fptoui_f32_i8(floor_res_6449); + ((__global int8_t *) mem_6791)[gtid_6404] = unsign_arg0_6450; + } + + error_0: + return; + #undef segmap_tblock_sizze_6401 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegmap_6606_dim1, 1, 1) +void chunked_entropyzisegmap_6606(__local uint64_t *shared_mem_aligned, __global int *global_failure, int failure_is_an_option, __global int64_t *global_failure_args, int64_t n_6046, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, __global unsigned char *mem_6759) +{ + #define segmap_tblock_sizze_6600 (chunked_entropyzisegmap_6606zisegmap_tblock_sizze_6600) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6811; + int32_t tblock_sizze_6814; + int32_t wave_sizze_6813; + int32_t block_id_6812; + int32_t global_tid_6810; + int64_t phys_tid_6606; + int64_t global_tid_6815; + int64_t slice_6816; + int64_t gtid_6605; + int64_t remnant_6817; + + local_tid_6811 = get_local_id(0); + tblock_sizze_6814 = get_local_size(0); + wave_sizze_6813 = LOCKSTEP_WIDTH; + block_id_6812 = get_tblock_id(0); + global_tid_6810 = block_id_6812 * tblock_sizze_6814 + local_tid_6811; + phys_tid_6606 = sext_i32_i64(global_tid_6810); + global_tid_6815 = sext_i32_i64(block_id_6812) * segmap_tblock_sizze_6600 + sext_i32_i64(local_tid_6811); + slice_6816 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6605 = global_tid_6815; + remnant_6817 = global_tid_6815 - gtid_6605; + if (slt64(gtid_6605, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197)) { + int64_t entropy_arg0_6608; + int64_t zt_lhs_6609; + int64_t entropy_arg0_6610; + int64_t j_m_i_6611; + bool empty_slice_6612; + int64_t m_6613; + int64_t i_p_m_t_s_6614; + bool zzero_leq_i_p_m_t_s_6615; + bool i_p_m_t_s_leq_w_6616; + bool zzero_lte_i_6617; + bool i_lte_j_6618; + bool y_6619; + bool y_6620; + bool forwards_ok_6621; + bool ok_or_empty_6622; + bool index_certs_6623; + + entropy_arg0_6608 = mul64(chunk_sizze_6047, gtid_6605); + zt_lhs_6609 = add64((int64_t) 1, gtid_6605); + entropy_arg0_6610 = mul64(chunk_sizze_6047, zt_lhs_6609); + j_m_i_6611 = sub64(entropy_arg0_6610, entropy_arg0_6608); + empty_slice_6612 = j_m_i_6611 == (int64_t) 0; + m_6613 = sub64(j_m_i_6611, (int64_t) 1); + 
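// [annotation] i_p_m_t_s is start + (len - 1), the last index the slice would read; the slice passes iff it is empty or 0 <= start <= end and that last index lies in [0, n). +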
i_p_m_t_s_6614 = add64(entropy_arg0_6608, m_6613); + zzero_leq_i_p_m_t_s_6615 = sle64((int64_t) 0, i_p_m_t_s_6614); + i_p_m_t_s_leq_w_6616 = slt64(i_p_m_t_s_6614, n_6046); + zzero_lte_i_6617 = sle64((int64_t) 0, entropy_arg0_6608); + i_lte_j_6618 = sle64(entropy_arg0_6608, entropy_arg0_6610); + y_6619 = i_p_m_t_s_leq_w_6616 && zzero_lte_i_6617; + y_6620 = zzero_leq_i_p_m_t_s_6615 && y_6619; + forwards_ok_6621 = i_lte_j_6618 && y_6620; + ok_or_empty_6622 = empty_slice_6612 || forwards_ok_6621; + if (!ok_or_empty_6622) { + { + if (atomic_cmpxchg_i32_global(global_failure, -1, 1) == -1) { + global_failure_args[0] = (int64_t) entropy_arg0_6608; + global_failure_args[1] = (int64_t) entropy_arg0_6610; + global_failure_args[2] = (int64_t) n_6046; + ; + } + return; + } + } + } + + error_0: + return; + #undef segmap_tblock_sizze_6600 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegmap_6645_dim1, 1, 1) +void chunked_entropyzisegmap_6645(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, int64_t num_tblocks_6640, int32_t virt_num_tblocks_6818, __global unsigned char *xs_mem_6757, __global unsigned char *mem_6759, __global unsigned char *mem_6777) +{ + #define segmap_tblock_sizze_6639 (chunked_entropyzisegmap_6645zisegmap_tblock_sizze_6639) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6820; + int32_t tblock_sizze_6823; + int32_t wave_sizze_6822; + int32_t block_id_6821; + int32_t global_tid_6819; + int64_t phys_tid_6645; + int32_t phys_tblock_id_6824; + int32_t iterations_6825; + + local_tid_6820 = get_local_id(0); + tblock_sizze_6823 = get_local_size(0); + wave_sizze_6822 = LOCKSTEP_WIDTH; + block_id_6821 = get_tblock_id(0); + global_tid_6819 = block_id_6821 * tblock_sizze_6823 + local_tid_6820; + phys_tid_6645 = sext_i32_i64(global_tid_6819); + phys_tblock_id_6824 = get_tblock_id(0); + iterations_6825 = sdiv_up32(virt_num_tblocks_6818 - phys_tblock_id_6824, sext_i64_i32(num_tblocks_6640)); + for (int32_t i_6826 = 0; i_6826 < iterations_6825; i_6826++) { + int32_t virt_tblock_id_6827; + int64_t global_tid_6828; + int64_t slice_6829; + int64_t gtid_6644; + int64_t remnant_6830; + + virt_tblock_id_6827 = phys_tblock_id_6824 + i_6826 * sext_i64_i32(num_tblocks_6640); + global_tid_6828 = sext_i32_i64(virt_tblock_id_6827) * segmap_tblock_sizze_6639 + sext_i32_i64(local_tid_6820); + slice_6829 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6644 = global_tid_6828; + remnant_6830 = global_tid_6828 - gtid_6644; + if (slt64(gtid_6644, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197)) { + int64_t index_primexp_6712; + int64_t binop_y_6704; + int64_t binop_x_6705; + int64_t index_primexp_6709; + bool index_certs_6648; + int64_t mem_6765[(int64_t) 256]; + + index_primexp_6712 = mul64(chunk_sizze_6047, gtid_6644); + binop_y_6704 = add64((int64_t) 1, gtid_6644); + binop_x_6705 = mul64(chunk_sizze_6047, binop_y_6704); + index_primexp_6709 = sub64(binop_x_6705, index_primexp_6712); + index_certs_6648 = 0; + for (int64_t nest_i_6831 = 0; nest_i_6831 < (int64_t) 256; nest_i_6831++) { + mem_6765[nest_i_6831] = (int64_t) 0; + } + for (int64_t iter_6744 = 0; iter_6744 < index_primexp_6709; iter_6744++) { + int64_t slice_6756; + int8_t pixel_6746; + int64_t u8_res_6659; + bool less_than_zzero_6748; + bool greater_than_sizze_6749; + bool outside_bounds_dim_6750; + + slice_6756 = index_primexp_6712 + iter_6744; + 
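// [annotation] bump the bucket for each byte of this thread's chunk; the zero-extended u8 value always lies in [0, 256), so the bounds test below cannot fail. +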
pixel_6746 = ((__global int8_t *) xs_mem_6757)[slice_6756]; + u8_res_6659 = zext_i8_i64(pixel_6746); + less_than_zzero_6748 = slt64(u8_res_6659, (int64_t) 0); + greater_than_sizze_6749 = sle64((int64_t) 256, u8_res_6659); + outside_bounds_dim_6750 = less_than_zzero_6748 || greater_than_sizze_6749; + if (!outside_bounds_dim_6750) { + int64_t read_hist_6752; + int64_t defunc_0_op_res_6656; + + read_hist_6752 = mem_6765[u8_res_6659]; + defunc_0_op_res_6656 = add64((int64_t) 1, read_hist_6752); + mem_6765[u8_res_6659] = defunc_0_op_res_6656; + } + } + for (int64_t i_0 = 0; i_0 < (int64_t) 256; i_0++) { + ((__global int64_t *) mem_6777)[gtid_6644 + i_0 * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197] = mem_6765[i_0]; + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_1: + return; + #undef segmap_tblock_sizze_6639 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegmap_6687_dim1, 1, 1) +void chunked_entropyzisegmap_6687(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, __global unsigned char *mem_6780, __global unsigned char *mem_6782) +{ + #define segmap_tblock_sizze_6683 (chunked_entropyzisegmap_6687zisegmap_tblock_sizze_6683) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6927; + int32_t tblock_sizze_6930; + int32_t wave_sizze_6929; + int32_t block_id_6928; + int32_t global_tid_6926; + int64_t phys_tid_6687; + int64_t global_tid_6931; + int64_t slice_6932; + int64_t gtid_6686; + int64_t remnant_6933; + + local_tid_6927 = get_local_id(0); + tblock_sizze_6930 = get_local_size(0); + wave_sizze_6929 = LOCKSTEP_WIDTH; + block_id_6928 = get_tblock_id(0); + global_tid_6926 = block_id_6928 * tblock_sizze_6930 + local_tid_6927; + phys_tid_6687 = sext_i32_i64(global_tid_6926); + global_tid_6931 = sext_i32_i64(block_id_6928) * segmap_tblock_sizze_6683 + sext_i32_i64(local_tid_6927); + slice_6932 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6686 = global_tid_6931; + remnant_6933 = global_tid_6931 - gtid_6686; + if (slt64(gtid_6686, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197)) { + int64_t binop_y_6715; + int64_t binop_x_6716; + int64_t binop_y_6719; + int64_t convop_x_6720; + float index_primexp_6721; + float defunc_0_f_res_6689; + float zs_lhs_6690; + float log2_res_6691; + float lifted_lambda_res_6692; + float floor_arg0_6693; + float floor_res_6694; + int8_t unsign_arg0_6695; + + binop_y_6715 = add64((int64_t) 1, gtid_6686); + binop_x_6716 = mul64(chunk_sizze_6047, binop_y_6715); + binop_y_6719 = mul64(chunk_sizze_6047, gtid_6686); + convop_x_6720 = sub64(binop_x_6716, binop_y_6719); + index_primexp_6721 = sitofp_i64_f32(convop_x_6720); + defunc_0_f_res_6689 = ((__global float *) mem_6780)[gtid_6686]; + zs_lhs_6690 = -1.0F * defunc_0_f_res_6689; + log2_res_6691 = futrts_log2_32(index_primexp_6721); + lifted_lambda_res_6692 = zs_lhs_6690 / log2_res_6691; + floor_arg0_6693 = 255.0F * lifted_lambda_res_6692; + floor_res_6694 = futrts_floor32(floor_arg0_6693); + unsign_arg0_6695 = fptoui_f32_i8(floor_res_6694); + ((__global int8_t *) mem_6782)[gtid_6686] = unsign_arg0_6695; + } + + error_0: + return; + #undef segmap_tblock_sizze_6683 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegred_large_6669_dim1, 1, 1) +void chunked_entropyzisegred_large_6669(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t chunk_sizze_6047, int64_t 
dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, int64_t num_tblocks_6664, int64_t blocks_per_segment_6862, int64_t q_6863, int64_t num_virtblocks_6864, int64_t threads_per_segment_6865, __global unsigned char *mem_6777, __global unsigned char *mem_6780, __global unsigned char *segred_tmp_mem_6866, __global unsigned char *counters_mem_6868) +{ + #define segred_tblock_sizze_6663 (chunked_entropyzisegred_large_6669zisegred_tblock_sizze_6663) + #define chunk_sizze_6833 (chunked_entropyzisegred_large_6669zichunk_sizze_6833) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *sync_arr_mem_6897_backing_1 = &shared_mem[0]; + const int64_t sync_arr_mem_6897_backing_1_offset = 0 + 8; + volatile __local unsigned char *red_arr_f32_mem_6895_backing_0 = &shared_mem[sync_arr_mem_6897_backing_1_offset]; + const int64_t red_arr_f32_mem_6895_backing_0_offset = sync_arr_mem_6897_backing_1_offset + ((int64_t) 4 * segred_tblock_sizze_6663 + srem64((int64_t) 8 - srem64((int64_t) 4 * segred_tblock_sizze_6663, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6891; + int32_t tblock_sizze_6894; + int32_t wave_sizze_6893; + int32_t block_id_6892; + int32_t global_tid_6890; + int64_t phys_tid_6669; + __local unsigned char *red_arr_f32_mem_6895; + __local unsigned char *sync_arr_mem_6897; + int32_t phys_tblock_id_6899; + int32_t iterations_6900; + + local_tid_6891 = get_local_id(0); + tblock_sizze_6894 = get_local_size(0); + wave_sizze_6893 = LOCKSTEP_WIDTH; + block_id_6892 = get_tblock_id(0); + global_tid_6890 = block_id_6892 * tblock_sizze_6894 + local_tid_6891; + phys_tid_6669 = sext_i32_i64(global_tid_6890); + red_arr_f32_mem_6895 = (__local unsigned char *) red_arr_f32_mem_6895_backing_0; + sync_arr_mem_6897 = (__local unsigned char *) sync_arr_mem_6897_backing_1; + phys_tblock_id_6899 = get_tblock_id(0); + iterations_6900 = sdiv_up32(sext_i64_i32(num_virtblocks_6864) - phys_tblock_id_6899, sext_i64_i32(num_tblocks_6664)); + for (int32_t i_6901 = 0; i_6901 < iterations_6900; i_6901++) { + int32_t virt_tblock_id_6902; + int64_t flat_segment_id_6903; + int64_t global_tid_6904; + int64_t slice_6905; + int64_t gtid_6667; + int64_t remnant_6906; + int64_t gtid_6668; + float eta_p_block_res_acc_6907; + float eta_p_6670; + float eta_p_6671; + int64_t tblock_id_in_segment_6911; + int64_t block_base_offset_6912; + int32_t offset_6915; + int32_t skip_waves_6916; + float eta_p_6908; + float eta_p_6909; + + virt_tblock_id_6902 = phys_tblock_id_6899 + i_6901 * sext_i64_i32(num_tblocks_6664); + flat_segment_id_6903 = squot64(sext_i32_i64(virt_tblock_id_6902), blocks_per_segment_6862); + global_tid_6904 = srem64(sext_i32_i64(virt_tblock_id_6902) * segred_tblock_sizze_6663 + sext_i32_i64(local_tid_6891), threads_per_segment_6865); + slice_6905 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6667 = flat_segment_id_6903; + remnant_6906 = flat_segment_id_6903 - gtid_6667; + // ne-initialise the outer (per-block) accumulator(s) + { + eta_p_block_res_acc_6907 = 0.0F; + } + tblock_id_in_segment_6911 = squot64(global_tid_6904, segred_tblock_sizze_6663); + block_base_offset_6912 = tblock_id_in_segment_6911 * q_6863 * segred_tblock_sizze_6663; + for (int64_t i_6913 = 0; i_6913 < q_6863; i_6913++) { + int64_t block_offset_6914 = block_base_offset_6912 + i_6913 * segred_tblock_sizze_6663; + + gtid_6668 = global_tid_6904 + threads_per_segment_6865 * i_6913; + if (slt64(gtid_6668, (int64_t) 256)) { + // apply map 
function(s) + { + // apply map function + { + int64_t binop_y_6724 = add64((int64_t) 1, gtid_6667); + int64_t binop_x_6725 = mul64(chunk_sizze_6047, binop_y_6724); + int64_t binop_y_6728 = mul64(chunk_sizze_6047, gtid_6667); + int64_t convop_x_6729 = sub64(binop_x_6725, binop_y_6728); + float index_primexp_6730 = sitofp_i64_f32(convop_x_6729); + int64_t eta_p_6675 = ((__global int64_t *) mem_6777)[gtid_6667 + gtid_6668 * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197]; + float i64_res_6676 = sitofp_i64_f32(eta_p_6675); + float lifted_lambda_res_6677 = i64_res_6676 / index_primexp_6730; + bool cond_6678 = lifted_lambda_res_6677 == 0.0F; + float lifted_lambda_res_6679; + + if (cond_6678) { + lifted_lambda_res_6679 = 0.0F; + } else { + float log2_res_6680 = futrts_log2_32(lifted_lambda_res_6677); + float lifted_lambda_res_f_res_6681 = lifted_lambda_res_6677 * log2_res_6680; + + lifted_lambda_res_6679 = lifted_lambda_res_f_res_6681; + } + // load accumulator(s) + { + eta_p_6670 = eta_p_block_res_acc_6907; + } + // load next value(s) + { + eta_p_6671 = lifted_lambda_res_6679; + } + // apply reduction operator(s) + { + float defunc_0_op_res_6672 = eta_p_6670 + eta_p_6671; + + // store in accumulator(s) + { + eta_p_block_res_acc_6907 = defunc_0_op_res_6672; + } + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // store accs. prims go in lmem; non-prims in params (in global mem) + { + ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_block_res_acc_6907; + } + barrier(CLK_LOCAL_MEM_FENCE); + skip_waves_6916 = 1; + offset_6915 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6891, sext_i64_i32(segred_tblock_sizze_6663))) { + eta_p_6908 = ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6915)]; + } + } + offset_6915 = 1; + while (slt32(offset_6915, wave_sizze_6893)) { + if (slt32(local_tid_6891 + offset_6915, sext_i64_i32(segred_tblock_sizze_6663)) && ((local_tid_6891 - squot32(local_tid_6891, wave_sizze_6893) * wave_sizze_6893) & (2 * offset_6915 - 1)) == 0) { + // read array element + { + eta_p_6909 = ((volatile __local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6915)]; + } + // apply reduction operation + { + float defunc_0_op_res_6910 = eta_p_6908 + eta_p_6909; + + eta_p_6908 = defunc_0_op_res_6910; + } + // write result of operation + { + ((volatile __local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6908; + } + } + offset_6915 *= 2; + } + while (slt32(skip_waves_6916, squot32(sext_i64_i32(segred_tblock_sizze_6663) + wave_sizze_6893 - 1, wave_sizze_6893))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6915 = skip_waves_6916 * wave_sizze_6893; + if (slt32(local_tid_6891 + offset_6915, sext_i64_i32(segred_tblock_sizze_6663)) && ((local_tid_6891 - squot32(local_tid_6891, wave_sizze_6893) * wave_sizze_6893) == 0 && (squot32(local_tid_6891, wave_sizze_6893) & (2 * skip_waves_6916 - 1)) == 0)) { + // read array element + { + eta_p_6909 = ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6915)]; + } + // apply reduction operation + { + float defunc_0_op_res_6910 = eta_p_6908 + eta_p_6909; + + eta_p_6908 = defunc_0_op_res_6910; + } + // write result of operation + { + ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6908; + } + } + skip_waves_6916 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + // thread 0 updates per-block acc(s); rest reset to ne + { + if 
(sext_i32_i64(local_tid_6891) == (int64_t) 0) { + eta_p_block_res_acc_6907 = eta_p_6908; + } else { + eta_p_block_res_acc_6907 = 0.0F; + } + } + if (blocks_per_segment_6862 == (int64_t) 1) { + // first thread in block saves final result to memory + { + if (local_tid_6891 == 0) { + ((__global float *) mem_6780)[gtid_6667] = eta_p_block_res_acc_6907; + } + } + } else { + int32_t old_counter_6917; + bool is_last_block_6918; + + // first thread in block saves block result to global memory + { + if (local_tid_6891 == 0) { + ((__global float *) segred_tmp_mem_6866)[sext_i32_i64(virt_tblock_id_6902)] = eta_p_block_res_acc_6907; + mem_fence_global(); + old_counter_6917 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6868)[srem64(flat_segment_id_6903, (int64_t) 20480)], (int) 1); + ((__local bool *) sync_arr_mem_6897)[(int64_t) 0] = old_counter_6917 == sext_i64_i32(blocks_per_segment_6862 - (int64_t) 1); + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + is_last_block_6918 = ((__local bool *) sync_arr_mem_6897)[(int64_t) 0]; + if (is_last_block_6918) { + if (local_tid_6891 == 0) { + old_counter_6917 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6868)[srem64(flat_segment_id_6903, (int64_t) 20480)], (int) sext_i64_i32((int64_t) 0 - blocks_per_segment_6862)); + } + // read in the per-block-results + { + int64_t read_per_thread_6919 = sdiv_up64(blocks_per_segment_6862, segred_tblock_sizze_6663); + + eta_p_6670 = 0.0F; + for (int64_t i_6920 = 0; i_6920 < read_per_thread_6919; i_6920++) { + int64_t block_res_id_6921 = sext_i32_i64(local_tid_6891) * read_per_thread_6919 + i_6920; + int64_t index_of_block_res_6922 = flat_segment_id_6903 * blocks_per_segment_6862 + block_res_id_6921; + + if (slt64(block_res_id_6921, blocks_per_segment_6862)) { + eta_p_6671 = ((__global float *) segred_tmp_mem_6866)[index_of_block_res_6922]; + + float defunc_0_op_res_6672 = eta_p_6670 + eta_p_6671; + + eta_p_6670 = defunc_0_op_res_6672; + } + } + } + ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6670; + barrier(CLK_LOCAL_MEM_FENCE); + // reduce the per-block results + { + int32_t offset_6923; + int32_t skip_waves_6924 = 1; + float eta_p_6908; + float eta_p_6909; + + offset_6923 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6891, sext_i64_i32(segred_tblock_sizze_6663))) { + eta_p_6908 = ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6923)]; + } + } + offset_6923 = 1; + while (slt32(offset_6923, wave_sizze_6893)) { + if (slt32(local_tid_6891 + offset_6923, sext_i64_i32(segred_tblock_sizze_6663)) && ((local_tid_6891 - squot32(local_tid_6891, wave_sizze_6893) * wave_sizze_6893) & (2 * offset_6923 - 1)) == 0) { + // read array element + { + eta_p_6909 = ((volatile __local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6923)]; + } + // apply reduction operation + { + float defunc_0_op_res_6910 = eta_p_6908 + eta_p_6909; + + eta_p_6908 = defunc_0_op_res_6910; + } + // write result of operation + { + ((volatile __local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6908; + } + } + offset_6923 *= 2; + } + while (slt32(skip_waves_6924, squot32(sext_i64_i32(segred_tblock_sizze_6663) + wave_sizze_6893 - 1, wave_sizze_6893))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6923 = skip_waves_6924 * wave_sizze_6893; + if (slt32(local_tid_6891 + offset_6923, sext_i64_i32(segred_tblock_sizze_6663)) && ((local_tid_6891 - squot32(local_tid_6891, 
wave_sizze_6893) * wave_sizze_6893) == 0 && (squot32(local_tid_6891, wave_sizze_6893) & (2 * skip_waves_6924 - 1)) == 0)) { + // read array element + { + eta_p_6909 = ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891 + offset_6923)]; + } + // apply reduction operation + { + float defunc_0_op_res_6910 = eta_p_6908 + eta_p_6909; + + eta_p_6908 = defunc_0_op_res_6910; + } + // write result of operation + { + ((__local float *) red_arr_f32_mem_6895)[sext_i32_i64(local_tid_6891)] = eta_p_6908; + } + } + skip_waves_6924 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + // and back to memory with the final result + { + if (local_tid_6891 == 0) { + ((__global float *) mem_6780)[gtid_6667] = eta_p_6908; + } + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_6: + return; + #undef segred_tblock_sizze_6663 + #undef chunk_sizze_6833 +} +FUTHARK_KERNEL_SIZED(chunked_entropyzisegred_small_6669_dim1, 1, 1) +void chunked_entropyzisegred_small_6669(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t chunk_sizze_6047, int64_t dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, int64_t num_tblocks_6664, int64_t segment_sizze_nonzzero_6834, __global unsigned char *mem_6777, __global unsigned char *mem_6780) +{ + #define segred_tblock_sizze_6663 (chunked_entropyzisegred_small_6669zisegred_tblock_sizze_6663) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *red_arr_f32_mem_6841_backing_0 = &shared_mem[0]; + const int64_t red_arr_f32_mem_6841_backing_0_offset = 0 + ((int64_t) 4 * segred_tblock_sizze_6663 + srem64((int64_t) 8 - srem64((int64_t) 4 * segred_tblock_sizze_6663, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6837; + int32_t tblock_sizze_6840; + int32_t wave_sizze_6839; + int32_t block_id_6838; + int32_t global_tid_6836; + int64_t phys_tid_6669; + __local unsigned char *red_arr_f32_mem_6841; + int32_t phys_tblock_id_6843; + int32_t iterations_6844; + + local_tid_6837 = get_local_id(0); + tblock_sizze_6840 = get_local_size(0); + wave_sizze_6839 = LOCKSTEP_WIDTH; + block_id_6838 = get_tblock_id(0); + global_tid_6836 = block_id_6838 * tblock_sizze_6840 + local_tid_6837; + phys_tid_6669 = sext_i32_i64(global_tid_6836); + red_arr_f32_mem_6841 = (__local unsigned char *) red_arr_f32_mem_6841_backing_0; + phys_tblock_id_6843 = get_tblock_id(0); + iterations_6844 = sdiv_up32(sext_i64_i32(sdiv_up64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834))) - phys_tblock_id_6843, sext_i64_i32(num_tblocks_6664)); + for (int32_t i_6845 = 0; i_6845 < iterations_6844; i_6845++) { + int32_t virt_tblock_id_6846; + int64_t slice_6847; + int64_t gtid_6667; + int64_t remnant_6848; + int64_t gtid_6668; + + virt_tblock_id_6846 = phys_tblock_id_6843 + i_6845 * sext_i64_i32(num_tblocks_6664); + slice_6847 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197; + gtid_6667 = squot64(sext_i32_i64(local_tid_6837), segment_sizze_nonzzero_6834) + sext_i32_i64(virt_tblock_id_6846) * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834); + remnant_6848 = squot64(sext_i32_i64(local_tid_6837), segment_sizze_nonzzero_6834) + sext_i32_i64(virt_tblock_id_6846) * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834) - gtid_6667; + gtid_6668 = srem64(sext_i32_i64(local_tid_6837), (int64_t) 256); + // apply map function if in bounds + { + if (slt64((int64_t) 0, (int64_t) 256) && 
(slt64(gtid_6667, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197) && slt64(sext_i32_i64(local_tid_6837), (int64_t) 256 * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834)))) { + // apply map function + { + int64_t binop_y_6724 = add64((int64_t) 1, gtid_6667); + int64_t binop_x_6725 = mul64(chunk_sizze_6047, binop_y_6724); + int64_t binop_y_6728 = mul64(chunk_sizze_6047, gtid_6667); + int64_t convop_x_6729 = sub64(binop_x_6725, binop_y_6728); + float index_primexp_6730 = sitofp_i64_f32(convop_x_6729); + int64_t eta_p_6675 = ((__global int64_t *) mem_6777)[gtid_6667 + gtid_6668 * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197]; + float i64_res_6676 = sitofp_i64_f32(eta_p_6675); + float lifted_lambda_res_6677 = i64_res_6676 / index_primexp_6730; + bool cond_6678 = lifted_lambda_res_6677 == 0.0F; + float lifted_lambda_res_6679; + + if (cond_6678) { + lifted_lambda_res_6679 = 0.0F; + } else { + float log2_res_6680 = futrts_log2_32(lifted_lambda_res_6677); + float lifted_lambda_res_f_res_6681 = lifted_lambda_res_6677 * log2_res_6680; + + lifted_lambda_res_6679 = lifted_lambda_res_f_res_6681; + } + // save results to be reduced + { + ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = lifted_lambda_res_6679; + } + } + } else { + ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = 0.0F; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (slt64((int64_t) 0, (int64_t) 256)) { + // perform segmented scan to imitate reduction + { + float eta_p_6670; + float eta_p_6671; + float eta_p_6849; + float eta_p_6850; + bool ltid_in_bounds_6852 = slt64(sext_i32_i64(local_tid_6837), (int64_t) 256 * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834)); + int32_t skip_threads_6853; + + // read input for in-block scan + { + if (ltid_in_bounds_6852) { + eta_p_6671 = ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)]; + if ((local_tid_6837 - squot32(local_tid_6837, 32) * 32) == 0) { + eta_p_6670 = eta_p_6671; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6853 = 1; + while (slt32(skip_threads_6853, 32)) { + bool thread_active_6854 = sle32(skip_threads_6853, local_tid_6837 - squot32(local_tid_6837, 32) * 32) && ltid_in_bounds_6852; + + if (thread_active_6854) { + // read operands + { + eta_p_6670 = ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837) - sext_i32_i64(skip_threads_6853)]; + } + } + // perform operation + { + bool inactive_6855 = slt64(srem64(sext_i32_i64(local_tid_6837), (int64_t) 256), sext_i32_i64(local_tid_6837) - sext_i32_i64(local_tid_6837 - skip_threads_6853)); + + if (thread_active_6854 && inactive_6855) { + eta_p_6670 = eta_p_6671; + } + if (thread_active_6854) { + if (!inactive_6855) { + float defunc_0_op_res_6672 = eta_p_6670 + eta_p_6671; + + eta_p_6670 = defunc_0_op_res_6672; + } + } + } + if (sle32(wave_sizze_6839, skip_threads_6853)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6854) { + // write result + { + ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = eta_p_6670; + eta_p_6671 = eta_p_6670; + } + } + if (sle32(wave_sizze_6839, skip_threads_6853)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6853 *= 2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // last thread of block 'i' writes its result to offset 'i' + { + if ((local_tid_6837 - squot32(local_tid_6837, 32) * 32) == 31 && ltid_in_bounds_6852) { + ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(squot32(local_tid_6837, 
32))] = eta_p_6670; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // scan the first block, after which offset 'i' contains carry-in for block 'i+1' + { + int32_t skip_threads_6856; + + // read input for in-block scan + { + if (squot32(local_tid_6837, 32) == 0 && ltid_in_bounds_6852) { + eta_p_6850 = ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)]; + if ((local_tid_6837 - squot32(local_tid_6837, 32) * 32) == 0) { + eta_p_6849 = eta_p_6850; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6856 = 1; + while (slt32(skip_threads_6856, 32)) { + bool thread_active_6857 = sle32(skip_threads_6856, local_tid_6837 - squot32(local_tid_6837, 32) * 32) && (squot32(local_tid_6837, 32) == 0 && ltid_in_bounds_6852); + + if (thread_active_6857) { + // read operands + { + eta_p_6849 = ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837) - sext_i32_i64(skip_threads_6856)]; + } + } + // perform operation + { + bool inactive_6858 = slt64(srem64(sext_i32_i64(local_tid_6837 * 32 + 32 - 1), (int64_t) 256), sext_i32_i64(local_tid_6837 * 32 + 32 - 1) - sext_i32_i64((local_tid_6837 - skip_threads_6856) * 32 + 32 - 1)); + + if (thread_active_6857 && inactive_6858) { + eta_p_6849 = eta_p_6850; + } + if (thread_active_6857) { + if (!inactive_6858) { + float defunc_0_op_res_6851 = eta_p_6849 + eta_p_6850; + + eta_p_6849 = defunc_0_op_res_6851; + } + } + } + if (sle32(wave_sizze_6839, skip_threads_6856)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6857) { + // write result + { + ((volatile __local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = eta_p_6849; + eta_p_6850 = eta_p_6849; + } + } + if (sle32(wave_sizze_6839, skip_threads_6856)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6856 *= 2; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + bool no_carry_in_6859 = squot32(local_tid_6837, 32) == 0 || !ltid_in_bounds_6852; + + // carry-in for every block except the first + { + // read operands + { + if (!no_carry_in_6859) { + eta_p_6671 = eta_p_6670; + eta_p_6670 = ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(squot32(local_tid_6837, 32)) - (int64_t) 1]; + } + } + // perform operation + { + bool inactive_6860 = slt64(srem64(sext_i32_i64(local_tid_6837), (int64_t) 256), sext_i32_i64(local_tid_6837) - sext_i32_i64(squot32(local_tid_6837, 32) * 32 - 1)); + + if (!no_carry_in_6859) { + if (inactive_6860) { + eta_p_6670 = eta_p_6671; + } + } + if (!no_carry_in_6859) { + if (!inactive_6860) { + float defunc_0_op_res_6672 = eta_p_6670 + eta_p_6671; + + eta_p_6670 = defunc_0_op_res_6672; + } + } + } + // write final result + { + if (!no_carry_in_6859) { + ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = eta_p_6670; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // restore correct values for first block + { + if (squot32(local_tid_6837, 32) == 0 && ltid_in_bounds_6852) { + ((__local float *) red_arr_f32_mem_6841)[sext_i32_i64(local_tid_6837)] = eta_p_6671; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // save final values of segments + { + if (slt64(sext_i32_i64(virt_tblock_id_6846) * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834) + sext_i32_i64(local_tid_6837), dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197) && slt64(sext_i32_i64(local_tid_6837), squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834))) { + float tmp_6861 = ((__local float *) red_arr_f32_mem_6841)[(sext_i32_i64(local_tid_6837) + (int64_t) 1) * 
segment_sizze_nonzzero_6834 - (int64_t) 1]; + + ((__global float *) mem_6780)[sext_i32_i64(virt_tblock_id_6846) * squot64(segred_tblock_sizze_6663, segment_sizze_nonzzero_6834) + sext_i32_i64(local_tid_6837)] = tmp_6861; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_3: + return; + #undef segred_tblock_sizze_6663 +} +FUTHARK_KERNEL_SIZED(entropyziseghist_global_6328_dim1, 1, 1) +void entropyziseghist_global_6328(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t n_5907, int64_t num_tblocks_6323, int64_t num_subhistos_6815, int32_t chk_i_6885, int64_t hist_H_chk_6886, __global unsigned char *xs_mem_6757, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define seghist_tblock_sizze_6321 (entropyziseghist_global_6328ziseghist_tblock_sizze_6321) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + + if (*global_failure >= 0) + return; + + int32_t local_tid_6888; + int32_t tblock_sizze_6891; + int32_t wave_sizze_6890; + int32_t block_id_6889; + int32_t global_tid_6887; + int64_t phys_tid_6328; + int32_t subhisto_ind_6892; + int64_t num_chunks_6893; + + local_tid_6888 = get_local_id(0); + tblock_sizze_6891 = get_local_size(0); + wave_sizze_6890 = LOCKSTEP_WIDTH; + block_id_6889 = get_tblock_id(0); + global_tid_6887 = block_id_6889 * tblock_sizze_6891 + local_tid_6888; + phys_tid_6328 = sext_i32_i64(global_tid_6887); + subhisto_ind_6892 = squot32(global_tid_6887, sdiv_up32(sext_i64_i32(seghist_tblock_sizze_6321 * num_tblocks_6323), sext_i64_i32(num_subhistos_6815))); + num_chunks_6893 = sdiv_up64(n_5907, sext_i32_i64(sext_i64_i32(seghist_tblock_sizze_6321 * num_tblocks_6323))); + for (int64_t chunk_i_6894 = 0; chunk_i_6894 < num_chunks_6893; chunk_i_6894++) { + int64_t i_6895 = chunk_i_6894 * sext_i32_i64(sext_i64_i32(seghist_tblock_sizze_6321 * num_tblocks_6323)) + sext_i32_i64(global_tid_6887); + + if (slt64(i_6895, n_5907)) { + int64_t slice_6896; + int64_t gtid_6327; + int64_t remnant_6897; + + slice_6896 = n_5907; + gtid_6327 = i_6895; + remnant_6897 = i_6895 - gtid_6327; + if (slt64(i_6895, n_5907)) { + int8_t eta_p_6332; + int64_t u8_res_6334; + + eta_p_6332 = ((__global int8_t *) xs_mem_6757)[gtid_6327]; + u8_res_6334 = zext_i8_i64(eta_p_6332); + // save map-out results + { } + // perform atomic updates + { + if (sle64(sext_i32_i64(chk_i_6885) * hist_H_chk_6886, u8_res_6334) && (slt64(u8_res_6334, sext_i32_i64(chk_i_6885) * hist_H_chk_6886 + hist_H_chk_6886) && (sle64((int64_t) 0, u8_res_6334) && slt64(u8_res_6334, (int64_t) 256)))) { + int64_t eta_p_6329; + int64_t eta_p_6330 = (int64_t) 1; + int64_t old_6898; + + old_6898 = atomic_add_i64_global(&((volatile __global int64_t *) defunc_0_map_res_subhistos_mem_6816)[sext_i32_i64(subhisto_ind_6892) * (int64_t) 256 + u8_res_6334], (int64_t) eta_p_6330); + } + } + } + } + } + + error_0: + return; + #undef seghist_tblock_sizze_6321 +} +FUTHARK_KERNEL_SIZED(entropyziseghist_local_6328_dim1, 1, 1) +void entropyziseghist_local_6328(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t n_5907, int64_t num_subhistos_6815, int64_t num_tblocks_6826, int32_t hist_M_6832, int32_t chk_i_6836, int64_t num_segments_6837, int64_t hist_H_chk_6838, int64_t histo_sizze_6839, int32_t init_per_thread_6840, __global unsigned char *xs_mem_6757, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define max_tblock_sizze_6825 (entropyziseghist_local_6328zimax_tblock_sizze_6825) + + __local 
unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *subhistogram_local_mem_6854_backing_0 = &shared_mem[0]; + const int64_t subhistogram_local_mem_6854_backing_0_offset = 0 + ((int64_t) 8 * (hist_M_6832 * hist_H_chk_6838) + srem64((int64_t) 8 - srem64((int64_t) 8 * (hist_M_6832 * hist_H_chk_6838), (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6842; + int32_t tblock_sizze_6845; + int32_t wave_sizze_6844; + int32_t block_id_6843; + int32_t global_tid_6841; + int64_t phys_tid_6328; + int32_t phys_tblock_id_6846; + int32_t iterations_6847; + + local_tid_6842 = get_local_id(0); + tblock_sizze_6845 = get_local_size(0); + wave_sizze_6844 = LOCKSTEP_WIDTH; + block_id_6843 = get_tblock_id(0); + global_tid_6841 = block_id_6843 * tblock_sizze_6845 + local_tid_6842; + phys_tid_6328 = sext_i32_i64(global_tid_6841); + phys_tblock_id_6846 = get_tblock_id(0); + iterations_6847 = sdiv_up32(sext_i64_i32(num_tblocks_6826 * num_segments_6837) - phys_tblock_id_6846, sext_i64_i32(num_tblocks_6826)); + for (int32_t i_6848 = 0; i_6848 < iterations_6847; i_6848++) { + int32_t virt_tblock_id_6849; + int32_t flat_segment_id_6850; + int32_t gid_in_segment_6851; + int32_t pgtid_in_segment_6852; + int32_t threads_per_segment_6853; + __local unsigned char *subhistogram_local_mem_6854; + int32_t thread_local_subhisto_i_6856; + int64_t num_chunks_6863; + + virt_tblock_id_6849 = phys_tblock_id_6846 + i_6848 * sext_i64_i32(num_tblocks_6826); + flat_segment_id_6850 = squot32(virt_tblock_id_6849, sext_i64_i32(num_tblocks_6826)); + gid_in_segment_6851 = srem32(virt_tblock_id_6849, sext_i64_i32(num_tblocks_6826)); + pgtid_in_segment_6852 = gid_in_segment_6851 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + threads_per_segment_6853 = sext_i64_i32(num_tblocks_6826 * max_tblock_sizze_6825); + subhistogram_local_mem_6854 = (__local unsigned char *) subhistogram_local_mem_6854_backing_0; + thread_local_subhisto_i_6856 = srem32(local_tid_6842, hist_M_6832); + // initialize histograms in shared memory + { + for (int32_t local_i_6857 = 0; local_i_6857 < init_per_thread_6840; local_i_6857++) { + int32_t j_6858 = local_i_6857 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + int32_t j_offset_6859 = hist_M_6832 * sext_i64_i32(histo_sizze_6839) * gid_in_segment_6851 + j_6858; + int32_t local_subhisto_i_6860 = squot32(j_6858, sext_i64_i32(histo_sizze_6839)); + int32_t global_subhisto_i_6861 = squot32(j_offset_6859, sext_i64_i32(histo_sizze_6839)); + + if (slt32(j_6858, hist_M_6832 * sext_i64_i32(histo_sizze_6839))) { + // First subhistogram is initialised from global memory; others with neutral element. 
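+ // (Sketch, using this kernel's own names: shared memory holds
+ // hist_M_6832 subhistograms of hist_H_chk_6838 buckets each, and the
+ // histogram is processed in chunks indexed by chk_i_6836. Subhistogram 0
+ // is seeded from the totals already accumulated in global memory, while
+ // subhistograms 1..hist_M_6832-1 start at 0, the neutral element of
+ // add64, so the compaction step below can sum all copies of a bucket
+ // without double-counting the carried-over counts.)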
+ { + if (global_subhisto_i_6861 == 0 && ((sle64((int64_t) 0, (int64_t) 0) && slt64((int64_t) 0, num_subhistos_6815)) && (sle64((int64_t) 0, sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838) && slt64(sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838, (int64_t) 256)))) { + int64_t tmp_6862 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839))) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838]; + + ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(local_subhisto_i_6860) * hist_H_chk_6838 + sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839)))] = tmp_6862; + } else { + ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(local_subhisto_i_6860) * hist_H_chk_6838 + sext_i32_i64(srem32(j_6858, sext_i64_i32(histo_sizze_6839)))] = (int64_t) 0; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + num_chunks_6863 = sdiv_up64(n_5907, sext_i32_i64(threads_per_segment_6853)); + for (int64_t chunk_i_6864 = 0; chunk_i_6864 < num_chunks_6863; chunk_i_6864++) { + int64_t i_6865 = chunk_i_6864 * sext_i32_i64(threads_per_segment_6853) + sext_i32_i64(pgtid_in_segment_6852); + + if (slt64(i_6865, n_5907)) { + int64_t gtid_6327; + int8_t eta_p_6332; + int64_t u8_res_6334; + + gtid_6327 = i_6865; + eta_p_6332 = ((__global int8_t *) xs_mem_6757)[gtid_6327]; + u8_res_6334 = zext_i8_i64(eta_p_6332); + if (chk_i_6836 == 0) { + // save map-out results + { } + } + // perform atomic updates + { + if ((sle64((int64_t) 0, u8_res_6334) && slt64(u8_res_6334, (int64_t) 256)) && (sle64(sext_i32_i64(chk_i_6836) * hist_H_chk_6838, u8_res_6334) && slt64(u8_res_6334, sext_i32_i64(chk_i_6836) * hist_H_chk_6838 + hist_H_chk_6838))) { + int64_t eta_p_6329; + int64_t eta_p_6330 = (int64_t) 1; + int64_t old_6866; + + old_6866 = atomic_add_i64_shared(&((volatile __local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(thread_local_subhisto_i_6856) * hist_H_chk_6838 + (u8_res_6334 - sext_i32_i64(chk_i_6836) * hist_H_chk_6838)], (int64_t) eta_p_6330); + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + // Compact the multiple shared memory subhistograms to result in global memory + { + int64_t trunc_H_6867 = smin64(hist_H_chk_6838, (int64_t) 256 - sext_i32_i64(chk_i_6836) * hist_H_chk_6838); + int32_t histo_sizze_6868 = sext_i64_i32(trunc_H_6867); + + for (int32_t local_i_6869 = 0; local_i_6869 < init_per_thread_6840; local_i_6869++) { + int32_t j_6870 = local_i_6869 * sext_i64_i32(max_tblock_sizze_6825) + local_tid_6842; + + if (slt32(j_6870, histo_sizze_6868)) { + int64_t eta_p_6329; + int64_t eta_p_6330; + + // Read values from subhistogram 0. + { + eta_p_6329 = ((__local int64_t *) subhistogram_local_mem_6854)[sext_i32_i64(j_6870)]; + } + // Accumulate based on values in other subhistograms. + { + for (int32_t subhisto_id_6871 = 0; subhisto_id_6871 < hist_M_6832 - 1; subhisto_id_6871++) { + eta_p_6330 = ((__local int64_t *) subhistogram_local_mem_6854)[(sext_i32_i64(subhisto_id_6871) + (int64_t) 1) * hist_H_chk_6838 + sext_i32_i64(j_6870)]; + + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + eta_p_6329 = defunc_0_op_res_6331; + } + } + // Put final bucket value in global memory. 
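+ // (Each virtual block folds its hist_M_6832 local copies of a bucket into
+ // slot srem64(virt_tblock_id_6849, num_tblocks_6826) of the global
+ // subhistogram array; the entropyzisegred_* kernels below then reduce
+ // those copies into the final 256-bucket byte histogram. Downstream,
+ // entropyzisegred_nonseg_6344 maps each count c to (c/n) * log2(c/n),
+ // treating 0 * log2(0) as 0, and sums the terms; the Shannon entropy of
+ // the byte distribution is the negation of that sum.)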
+ { + ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[srem64(sext_i32_i64(virt_tblock_id_6849), num_tblocks_6826) * (int64_t) 256 + (sext_i32_i64(j_6870) + sext_i32_i64(chk_i_6836) * hist_H_chk_6838)] = eta_p_6329; + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_2: + return; + #undef max_tblock_sizze_6825 +} +FUTHARK_KERNEL_SIZED(entropyzisegred_large_6901_dim1, 1, 1) +void entropyzisegred_large_6901(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t num_tblocks_6323, int64_t num_subhistos_6815, int64_t blocks_per_segment_6932, int64_t q_6933, int64_t num_virtblocks_6934, int64_t threads_per_segment_6935, __global unsigned char *mem_6758, __global unsigned char *defunc_0_map_res_subhistos_mem_6816, __global unsigned char *segred_tmp_mem_6936, __global unsigned char *counters_mem_6938) +{ + #define seghist_tblock_sizze_6321 (entropyzisegred_large_6901ziseghist_tblock_sizze_6321) + #define chunk_sizze_6902 (entropyzisegred_large_6901zichunk_sizze_6902) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *sync_arr_mem_6967_backing_1 = &shared_mem[0]; + const int64_t sync_arr_mem_6967_backing_1_offset = 0 + 8; + volatile __local unsigned char *red_arr_i64_mem_6965_backing_0 = &shared_mem[sync_arr_mem_6967_backing_1_offset]; + const int64_t red_arr_i64_mem_6965_backing_0_offset = sync_arr_mem_6967_backing_1_offset + ((int64_t) 8 * seghist_tblock_sizze_6321 + srem64((int64_t) 8 - srem64((int64_t) 8 * seghist_tblock_sizze_6321, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6961; + int32_t tblock_sizze_6964; + int32_t wave_sizze_6963; + int32_t block_id_6962; + int32_t global_tid_6960; + int64_t flat_gtid_6901; + __local unsigned char *red_arr_i64_mem_6965; + __local unsigned char *sync_arr_mem_6967; + int32_t phys_tblock_id_6969; + int32_t iterations_6970; + + local_tid_6961 = get_local_id(0); + tblock_sizze_6964 = get_local_size(0); + wave_sizze_6963 = LOCKSTEP_WIDTH; + block_id_6962 = get_tblock_id(0); + global_tid_6960 = block_id_6962 * tblock_sizze_6964 + local_tid_6961; + flat_gtid_6901 = sext_i32_i64(global_tid_6960); + red_arr_i64_mem_6965 = (__local unsigned char *) red_arr_i64_mem_6965_backing_0; + sync_arr_mem_6967 = (__local unsigned char *) sync_arr_mem_6967_backing_1; + phys_tblock_id_6969 = get_tblock_id(0); + iterations_6970 = sdiv_up32(sext_i64_i32(num_virtblocks_6934) - phys_tblock_id_6969, sext_i64_i32(num_tblocks_6323)); + for (int32_t i_6971 = 0; i_6971 < iterations_6970; i_6971++) { + int32_t virt_tblock_id_6972; + int64_t flat_segment_id_6973; + int64_t global_tid_6974; + int64_t slice_6975; + int64_t bucket_id_6899; + int64_t remnant_6976; + int64_t subhistogram_id_6900; + int64_t eta_p_block_res_acc_6977; + int64_t eta_p_6329; + int64_t eta_p_6330; + int64_t tblock_id_in_segment_6981; + int64_t block_base_offset_6982; + int32_t offset_6985; + int32_t skip_waves_6986; + int64_t eta_p_6978; + int64_t eta_p_6979; + + virt_tblock_id_6972 = phys_tblock_id_6969 + i_6971 * sext_i64_i32(num_tblocks_6323); + flat_segment_id_6973 = squot64(sext_i32_i64(virt_tblock_id_6972), blocks_per_segment_6932); + global_tid_6974 = srem64(sext_i32_i64(virt_tblock_id_6972) * seghist_tblock_sizze_6321 + sext_i32_i64(local_tid_6961), threads_per_segment_6935); + slice_6975 = (int64_t) 256; + bucket_id_6899 = flat_segment_id_6973; + remnant_6976 = flat_segment_id_6973 - bucket_id_6899; + // ne-initialise the outer 
(per-block) accumulator(s) + { + eta_p_block_res_acc_6977 = (int64_t) 0; + } + tblock_id_in_segment_6981 = squot64(global_tid_6974, seghist_tblock_sizze_6321); + block_base_offset_6982 = tblock_id_in_segment_6981 * q_6933 * seghist_tblock_sizze_6321; + for (int64_t i_6983 = 0; i_6983 < q_6933; i_6983++) { + int64_t block_offset_6984 = block_base_offset_6982 + i_6983 * seghist_tblock_sizze_6321; + + subhistogram_id_6900 = global_tid_6974 + threads_per_segment_6935 * i_6983; + if (slt64(subhistogram_id_6900, num_subhistos_6815)) { + // apply map function(s) + { + // load accumulator(s) + { + eta_p_6329 = eta_p_block_res_acc_6977; + } + // load next value(s) + { + eta_p_6330 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[subhistogram_id_6900 * (int64_t) 256 + bucket_id_6899]; + } + // apply reduction operator(s) + { + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + // store in accumulator(s) + { + eta_p_block_res_acc_6977 = defunc_0_op_res_6331; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // store accs. prims go in lmem; non-prims in params (in global mem) + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_block_res_acc_6977; + } + barrier(CLK_LOCAL_MEM_FENCE); + skip_waves_6986 = 1; + offset_6985 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6961, sext_i64_i32(seghist_tblock_sizze_6321))) { + eta_p_6978 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + } + offset_6985 = 1; + while (slt32(offset_6985, wave_sizze_6963)) { + if (slt32(local_tid_6961 + offset_6985, sext_i64_i32(seghist_tblock_sizze_6321)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) & (2 * offset_6985 - 1)) == 0) { + // read array element + { + eta_p_6979 = ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + offset_6985 *= 2; + } + while (slt32(skip_waves_6986, squot32(sext_i64_i32(seghist_tblock_sizze_6321) + wave_sizze_6963 - 1, wave_sizze_6963))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6985 = skip_waves_6986 * wave_sizze_6963; + if (slt32(local_tid_6961 + offset_6985, sext_i64_i32(seghist_tblock_sizze_6321)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) == 0 && (squot32(local_tid_6961, wave_sizze_6963) & (2 * skip_waves_6986 - 1)) == 0)) { + // read array element + { + eta_p_6979 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6985)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + skip_waves_6986 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + // thread 0 updates per-block acc(s); rest reset to ne + { + if (sext_i32_i64(local_tid_6961) == (int64_t) 0) { + eta_p_block_res_acc_6977 = eta_p_6978; + } else { + eta_p_block_res_acc_6977 = (int64_t) 0; + } + } + if (blocks_per_segment_6932 == (int64_t) 1) { + // first thread in block saves final result to memory + { + if 
(local_tid_6961 == 0) { + ((__global int64_t *) mem_6758)[bucket_id_6899] = eta_p_block_res_acc_6977; + } + } + } else { + int32_t old_counter_6987; + bool is_last_block_6988; + + // first thread in block saves block result to global memory + { + if (local_tid_6961 == 0) { + ((__global int64_t *) segred_tmp_mem_6936)[sext_i32_i64(virt_tblock_id_6972)] = eta_p_block_res_acc_6977; + mem_fence_global(); + old_counter_6987 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6938)[srem64(flat_segment_id_6973, (int64_t) 20480)], (int) 1); + ((__local bool *) sync_arr_mem_6967)[(int64_t) 0] = old_counter_6987 == sext_i64_i32(blocks_per_segment_6932 - (int64_t) 1); + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + is_last_block_6988 = ((__local bool *) sync_arr_mem_6967)[(int64_t) 0]; + if (is_last_block_6988) { + if (local_tid_6961 == 0) { + old_counter_6987 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6938)[srem64(flat_segment_id_6973, (int64_t) 20480)], (int) sext_i64_i32((int64_t) 0 - blocks_per_segment_6932)); + } + // read in the per-block-results + { + int64_t read_per_thread_6989 = sdiv_up64(blocks_per_segment_6932, seghist_tblock_sizze_6321); + + eta_p_6329 = (int64_t) 0; + for (int64_t i_6990 = 0; i_6990 < read_per_thread_6989; i_6990++) { + int64_t block_res_id_6991 = sext_i32_i64(local_tid_6961) * read_per_thread_6989 + i_6990; + int64_t index_of_block_res_6992 = flat_segment_id_6973 * blocks_per_segment_6932 + block_res_id_6991; + + if (slt64(block_res_id_6991, blocks_per_segment_6932)) { + eta_p_6330 = ((__global int64_t *) segred_tmp_mem_6936)[index_of_block_res_6992]; + + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + eta_p_6329 = defunc_0_op_res_6331; + } + } + } + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6329; + barrier(CLK_LOCAL_MEM_FENCE); + // reduce the per-block results + { + int32_t offset_6993; + int32_t skip_waves_6994 = 1; + int64_t eta_p_6978; + int64_t eta_p_6979; + + offset_6993 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_6961, sext_i64_i32(seghist_tblock_sizze_6321))) { + eta_p_6978 = ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + } + offset_6993 = 1; + while (slt32(offset_6993, wave_sizze_6963)) { + if (slt32(local_tid_6961 + offset_6993, sext_i64_i32(seghist_tblock_sizze_6321)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) & (2 * offset_6993 - 1)) == 0) { + // read array element + { + eta_p_6979 = ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((volatile __local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + offset_6993 *= 2; + } + while (slt32(skip_waves_6994, squot32(sext_i64_i32(seghist_tblock_sizze_6321) + wave_sizze_6963 - 1, wave_sizze_6963))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_6993 = skip_waves_6994 * wave_sizze_6963; + if (slt32(local_tid_6961 + offset_6993, sext_i64_i32(seghist_tblock_sizze_6321)) && ((local_tid_6961 - squot32(local_tid_6961, wave_sizze_6963) * wave_sizze_6963) == 0 && (squot32(local_tid_6961, wave_sizze_6963) & (2 * skip_waves_6994 - 1)) == 0)) { + // read array element + { + eta_p_6979 = ((__local int64_t *) 
red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961 + offset_6993)]; + } + // apply reduction operation + { + int64_t defunc_0_op_res_6980 = add64(eta_p_6978, eta_p_6979); + + eta_p_6978 = defunc_0_op_res_6980; + } + // write result of operation + { + ((__local int64_t *) red_arr_i64_mem_6965)[sext_i32_i64(local_tid_6961)] = eta_p_6978; + } + } + skip_waves_6994 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + // and back to memory with the final result + { + if (local_tid_6961 == 0) { + ((__global int64_t *) mem_6758)[bucket_id_6899] = eta_p_6978; + } + } + } + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_6: + return; + #undef seghist_tblock_sizze_6321 + #undef chunk_sizze_6902 +} +FUTHARK_KERNEL_SIZED(entropyzisegred_nonseg_6344_dim1, 1, 1) +void entropyzisegred_nonseg_6344(__local uint64_t *shared_mem_aligned, __global int *global_failure, float i64_res_6254, int64_t num_tblocks_6339, int64_t num_threads_7001, __global unsigned char *mem_6758, __global unsigned char *mem_6761, __global unsigned char *counters_mem_6997, __global unsigned char *segred_tmp_mem_6999) +{ + #define segred_tblock_sizze_6337 (entropyzisegred_nonseg_6344zisegred_tblock_sizze_6337) + #define chunk_sizze_6996 (entropyzisegred_nonseg_6344zichunk_sizze_6996) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *sync_arr_mem_7009_backing_1 = &shared_mem[0]; + const int64_t sync_arr_mem_7009_backing_1_offset = 0 + 8; + volatile __local unsigned char *red_arr_f32_mem_7007_backing_0 = &shared_mem[sync_arr_mem_7009_backing_1_offset]; + const int64_t red_arr_f32_mem_7007_backing_0_offset = sync_arr_mem_7009_backing_1_offset + ((int64_t) 4 * segred_tblock_sizze_6337 + srem64((int64_t) 8 - srem64((int64_t) 4 * segred_tblock_sizze_6337, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_7003; + int32_t tblock_sizze_7006; + int32_t wave_sizze_7005; + int32_t block_id_7004; + int32_t global_tid_7002; + int64_t phys_tid_6344; + __local unsigned char *red_arr_f32_mem_7007; + __local unsigned char *sync_arr_mem_7009; + int64_t dummy_6342; + int64_t gtid_6343; + int64_t q_7011; + float eta_p_block_res_acc_7012; + float eta_p_6266; + float eta_p_6267; + int64_t tblock_id_in_segment_7016; + int64_t block_base_offset_7017; + int32_t offset_7020; + int32_t skip_waves_7021; + float eta_p_7013; + float eta_p_7014; + int32_t old_counter_7022; + bool is_last_block_7023; + + local_tid_7003 = get_local_id(0); + tblock_sizze_7006 = get_local_size(0); + wave_sizze_7005 = LOCKSTEP_WIDTH; + block_id_7004 = get_tblock_id(0); + global_tid_7002 = block_id_7004 * tblock_sizze_7006 + local_tid_7003; + phys_tid_6344 = sext_i32_i64(global_tid_7002); + red_arr_f32_mem_7007 = (__local unsigned char *) red_arr_f32_mem_7007_backing_0; + sync_arr_mem_7009 = (__local unsigned char *) sync_arr_mem_7009_backing_1; + dummy_6342 = (int64_t) 0; + gtid_6343 = (int64_t) 0; + q_7011 = sdiv_up64((int64_t) 256, sext_i32_i64(sext_i64_i32(segred_tblock_sizze_6337 * num_tblocks_6339)) * chunk_sizze_6996); + // ne-initialise the outer (per-block) accumulator(s) + { + eta_p_block_res_acc_7012 = 0.0F; + } + tblock_id_in_segment_7016 = squot64(phys_tid_6344, segred_tblock_sizze_6337); + block_base_offset_7017 = tblock_id_in_segment_7016 * q_7011 * segred_tblock_sizze_6337; + for (int64_t i_7018 = 0; i_7018 < q_7011; i_7018++) { + int64_t block_offset_7019 = block_base_offset_7017 + i_7018 * segred_tblock_sizze_6337; + + gtid_6343 = phys_tid_6344 + 
num_threads_7001 * i_7018; + if (slt64(gtid_6343, (int64_t) 256)) { + // apply map function(s) + { + // apply map function + { + int64_t eta_p_6282 = ((__global int64_t *) mem_6758)[gtid_6343]; + float i64_res_6283 = sitofp_i64_f32(eta_p_6282); + float lifted_lambda_res_6284 = i64_res_6283 / i64_res_6254; + bool cond_6286 = lifted_lambda_res_6284 == 0.0F; + float lifted_lambda_res_6287; + + if (cond_6286) { + lifted_lambda_res_6287 = 0.0F; + } else { + float log2_res_6288 = futrts_log2_32(lifted_lambda_res_6284); + float lifted_lambda_res_f_res_6289 = lifted_lambda_res_6284 * log2_res_6288; + + lifted_lambda_res_6287 = lifted_lambda_res_f_res_6289; + } + // load accumulator(s) + { + eta_p_6266 = eta_p_block_res_acc_7012; + } + // load next value(s) + { + eta_p_6267 = lifted_lambda_res_6287; + } + // apply reduction operator(s) + { + float defunc_0_op_res_6268 = eta_p_6266 + eta_p_6267; + + // store in accumulator(s) + { + eta_p_block_res_acc_7012 = defunc_0_op_res_6268; + } + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // store accs. prims go in lmem; non-prims in params (in global mem) + { + ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_block_res_acc_7012; + } + barrier(CLK_LOCAL_MEM_FENCE); + skip_waves_7021 = 1; + offset_7020 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_7003, sext_i64_i32(segred_tblock_sizze_6337))) { + eta_p_7013 = ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7020)]; + } + } + offset_7020 = 1; + while (slt32(offset_7020, wave_sizze_7005)) { + if (slt32(local_tid_7003 + offset_7020, sext_i64_i32(segred_tblock_sizze_6337)) && ((local_tid_7003 - squot32(local_tid_7003, wave_sizze_7005) * wave_sizze_7005) & (2 * offset_7020 - 1)) == 0) { + // read array element + { + eta_p_7014 = ((volatile __local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7020)]; + } + // apply reduction operation + { + float defunc_0_op_res_7015 = eta_p_7013 + eta_p_7014; + + eta_p_7013 = defunc_0_op_res_7015; + } + // write result of operation + { + ((volatile __local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_7013; + } + } + offset_7020 *= 2; + } + while (slt32(skip_waves_7021, squot32(sext_i64_i32(segred_tblock_sizze_6337) + wave_sizze_7005 - 1, wave_sizze_7005))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_7020 = skip_waves_7021 * wave_sizze_7005; + if (slt32(local_tid_7003 + offset_7020, sext_i64_i32(segred_tblock_sizze_6337)) && ((local_tid_7003 - squot32(local_tid_7003, wave_sizze_7005) * wave_sizze_7005) == 0 && (squot32(local_tid_7003, wave_sizze_7005) & (2 * skip_waves_7021 - 1)) == 0)) { + // read array element + { + eta_p_7014 = ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7020)]; + } + // apply reduction operation + { + float defunc_0_op_res_7015 = eta_p_7013 + eta_p_7014; + + eta_p_7013 = defunc_0_op_res_7015; + } + // write result of operation + { + ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_7013; + } + } + skip_waves_7021 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + // thread 0 updates per-block acc(s); rest reset to ne + { + if (sext_i32_i64(local_tid_7003) == (int64_t) 0) { + eta_p_block_res_acc_7012 = eta_p_7013; + } else { + eta_p_block_res_acc_7012 = 0.0F; + } + } + // first thread in block saves block result to global memory + { + if (local_tid_7003 == 0) { + ((__global float *) 
segred_tmp_mem_6999)[sext_i32_i64(block_id_7004)] = eta_p_block_res_acc_7012; + mem_fence_global(); + old_counter_7022 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6997)[(int64_t) 0], (int) 1); + ((__local bool *) sync_arr_mem_7009)[(int64_t) 0] = old_counter_7022 == sext_i64_i32(num_tblocks_6339 - (int64_t) 1); + } + } + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + is_last_block_7023 = ((__local bool *) sync_arr_mem_7009)[(int64_t) 0]; + if (is_last_block_7023) { + if (local_tid_7003 == 0) { + old_counter_7022 = atomic_add_i32_global(&((volatile __global int *) counters_mem_6997)[(int64_t) 0], (int) sext_i64_i32((int64_t) 0 - num_tblocks_6339)); + } + // read in the per-block-results + { + int64_t read_per_thread_7024 = sdiv_up64(num_tblocks_6339, segred_tblock_sizze_6337); + + eta_p_6266 = 0.0F; + for (int64_t i_7025 = 0; i_7025 < read_per_thread_7024; i_7025++) { + int64_t block_res_id_7026 = sext_i32_i64(local_tid_7003) * read_per_thread_7024 + i_7025; + int64_t index_of_block_res_7027 = block_res_id_7026; + + if (slt64(block_res_id_7026, num_tblocks_6339)) { + eta_p_6267 = ((__global float *) segred_tmp_mem_6999)[index_of_block_res_7027]; + + float defunc_0_op_res_6268 = eta_p_6266 + eta_p_6267; + + eta_p_6266 = defunc_0_op_res_6268; + } + } + } + ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_6266; + barrier(CLK_LOCAL_MEM_FENCE); + // reduce the per-block results + { + int32_t offset_7028; + int32_t skip_waves_7029 = 1; + float eta_p_7013; + float eta_p_7014; + + offset_7028 = 0; + // participating threads read initial accumulator + { + if (slt32(local_tid_7003, sext_i64_i32(segred_tblock_sizze_6337))) { + eta_p_7013 = ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7028)]; + } + } + offset_7028 = 1; + while (slt32(offset_7028, wave_sizze_7005)) { + if (slt32(local_tid_7003 + offset_7028, sext_i64_i32(segred_tblock_sizze_6337)) && ((local_tid_7003 - squot32(local_tid_7003, wave_sizze_7005) * wave_sizze_7005) & (2 * offset_7028 - 1)) == 0) { + // read array element + { + eta_p_7014 = ((volatile __local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7028)]; + } + // apply reduction operation + { + float defunc_0_op_res_7015 = eta_p_7013 + eta_p_7014; + + eta_p_7013 = defunc_0_op_res_7015; + } + // write result of operation + { + ((volatile __local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_7013; + } + } + offset_7028 *= 2; + } + while (slt32(skip_waves_7029, squot32(sext_i64_i32(segred_tblock_sizze_6337) + wave_sizze_7005 - 1, wave_sizze_7005))) { + barrier(CLK_LOCAL_MEM_FENCE); + offset_7028 = skip_waves_7029 * wave_sizze_7005; + if (slt32(local_tid_7003 + offset_7028, sext_i64_i32(segred_tblock_sizze_6337)) && ((local_tid_7003 - squot32(local_tid_7003, wave_sizze_7005) * wave_sizze_7005) == 0 && (squot32(local_tid_7003, wave_sizze_7005) & (2 * skip_waves_7029 - 1)) == 0)) { + // read array element + { + eta_p_7014 = ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003 + offset_7028)]; + } + // apply reduction operation + { + float defunc_0_op_res_7015 = eta_p_7013 + eta_p_7014; + + eta_p_7013 = defunc_0_op_res_7015; + } + // write result of operation + { + ((__local float *) red_arr_f32_mem_7007)[sext_i32_i64(local_tid_7003)] = eta_p_7013; + } + } + skip_waves_7029 *= 2; + } + barrier(CLK_LOCAL_MEM_FENCE); + // and back to memory with the final result + { + if (local_tid_7003 == 0) { + ((__global float *) mem_6761)[(int64_t) 0] = 
eta_p_7013; + } + } + } + } + + error_5: + return; + #undef segred_tblock_sizze_6337 + #undef chunk_sizze_6996 +} +FUTHARK_KERNEL_SIZED(entropyzisegred_small_6901_dim1, 1, 1) +void entropyzisegred_small_6901(__local uint64_t *shared_mem_aligned, __global int *global_failure, int64_t num_tblocks_6323, int64_t num_subhistos_6815, int64_t segment_sizze_nonzzero_6903, __global unsigned char *mem_6758, __global unsigned char *defunc_0_map_res_subhistos_mem_6816) +{ + #define seghist_tblock_sizze_6321 (entropyzisegred_small_6901ziseghist_tblock_sizze_6321) + + __local unsigned char *shared_mem = (__local unsigned char *) shared_mem_aligned; + volatile __local unsigned char *red_arr_i64_mem_6910_backing_0 = &shared_mem[0]; + const int64_t red_arr_i64_mem_6910_backing_0_offset = 0 + ((int64_t) 8 * seghist_tblock_sizze_6321 + srem64((int64_t) 8 - srem64((int64_t) 8 * seghist_tblock_sizze_6321, (int64_t) 8), (int64_t) 8)); + + if (*global_failure >= 0) + return; + + int32_t local_tid_6906; + int32_t tblock_sizze_6909; + int32_t wave_sizze_6908; + int32_t block_id_6907; + int32_t global_tid_6905; + int64_t flat_gtid_6901; + __local unsigned char *red_arr_i64_mem_6910; + int32_t phys_tblock_id_6912; + int32_t iterations_6913; + + local_tid_6906 = get_local_id(0); + tblock_sizze_6909 = get_local_size(0); + wave_sizze_6908 = LOCKSTEP_WIDTH; + block_id_6907 = get_tblock_id(0); + global_tid_6905 = block_id_6907 * tblock_sizze_6909 + local_tid_6906; + flat_gtid_6901 = sext_i32_i64(global_tid_6905); + red_arr_i64_mem_6910 = (__local unsigned char *) red_arr_i64_mem_6910_backing_0; + phys_tblock_id_6912 = get_tblock_id(0); + iterations_6913 = sdiv_up32(sext_i64_i32(sdiv_up64((int64_t) 256, squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903))) - phys_tblock_id_6912, sext_i64_i32(num_tblocks_6323)); + for (int32_t i_6914 = 0; i_6914 < iterations_6913; i_6914++) { + int32_t virt_tblock_id_6915; + int64_t slice_6916; + int64_t bucket_id_6899; + int64_t remnant_6917; + int64_t subhistogram_id_6900; + + virt_tblock_id_6915 = phys_tblock_id_6912 + i_6914 * sext_i64_i32(num_tblocks_6323); + slice_6916 = (int64_t) 256; + bucket_id_6899 = squot64(sext_i32_i64(local_tid_6906), segment_sizze_nonzzero_6903) + sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903); + remnant_6917 = squot64(sext_i32_i64(local_tid_6906), segment_sizze_nonzzero_6903) + sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903) - bucket_id_6899; + subhistogram_id_6900 = srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815); + // apply map function if in bounds + { + if (slt64((int64_t) 0, num_subhistos_6815) && (slt64(bucket_id_6899, (int64_t) 256) && slt64(sext_i32_i64(local_tid_6906), num_subhistos_6815 * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903)))) { + // save results to be reduced + { + int64_t tmp_6918 = ((__global int64_t *) defunc_0_map_res_subhistos_mem_6816)[subhistogram_id_6900 * (int64_t) 256 + bucket_id_6899]; + + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = tmp_6918; + } + } else { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = (int64_t) 0; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (slt64((int64_t) 0, num_subhistos_6815)) { + // perform segmented scan to imitate reduction + { + int64_t eta_p_6329; + int64_t eta_p_6330; + int64_t eta_p_6919; + int64_t eta_p_6920; + bool ltid_in_bounds_6922 = slt64(sext_i32_i64(local_tid_6906), num_subhistos_6815 * 
squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903)); + int32_t skip_threads_6923; + + // read input for in-block scan + { + if (ltid_in_bounds_6922) { + eta_p_6330 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)]; + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 0) { + eta_p_6329 = eta_p_6330; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6923 = 1; + while (slt32(skip_threads_6923, 32)) { + bool thread_active_6924 = sle32(skip_threads_6923, local_tid_6906 - squot32(local_tid_6906, 32) * 32) && ltid_in_bounds_6922; + + if (thread_active_6924) { + // read operands + { + eta_p_6329 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906) - sext_i32_i64(skip_threads_6923)]; + } + } + // perform operation + { + bool inactive_6925 = slt64(srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815), sext_i32_i64(local_tid_6906) - sext_i32_i64(local_tid_6906 - skip_threads_6923)); + + if (thread_active_6924 && inactive_6925) { + eta_p_6329 = eta_p_6330; + } + if (thread_active_6924) { + if (!inactive_6925) { + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + eta_p_6329 = defunc_0_op_res_6331; + } + } + } + if (sle32(wave_sizze_6908, skip_threads_6923)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6924) { + // write result + { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6329; + eta_p_6330 = eta_p_6329; + } + } + if (sle32(wave_sizze_6908, skip_threads_6923)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6923 *= 2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // last thread of block 'i' writes its result to offset 'i' + { + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 31 && ltid_in_bounds_6922) { + ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(squot32(local_tid_6906, 32))] = eta_p_6329; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // scan the first block, after which offset 'i' contains carry-in for block 'i+1' + { + int32_t skip_threads_6926; + + // read input for in-block scan + { + if (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922) { + eta_p_6920 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)]; + if ((local_tid_6906 - squot32(local_tid_6906, 32) * 32) == 0) { + eta_p_6919 = eta_p_6920; + } + } + } + // in-block scan (hopefully no barriers needed) + { + skip_threads_6926 = 1; + while (slt32(skip_threads_6926, 32)) { + bool thread_active_6927 = sle32(skip_threads_6926, local_tid_6906 - squot32(local_tid_6906, 32) * 32) && (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922); + + if (thread_active_6927) { + // read operands + { + eta_p_6919 = ((volatile __local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906) - sext_i32_i64(skip_threads_6926)]; + } + } + // perform operation + { + bool inactive_6928 = slt64(srem64(sext_i32_i64(local_tid_6906 * 32 + 32 - 1), num_subhistos_6815), sext_i32_i64(local_tid_6906 * 32 + 32 - 1) - sext_i32_i64((local_tid_6906 - skip_threads_6926) * 32 + 32 - 1)); + + if (thread_active_6927 && inactive_6928) { + eta_p_6919 = eta_p_6920; + } + if (thread_active_6927) { + if (!inactive_6928) { + int64_t defunc_0_op_res_6921 = add64(eta_p_6919, eta_p_6920); + + eta_p_6919 = defunc_0_op_res_6921; + } + } + } + if (sle32(wave_sizze_6908, skip_threads_6926)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + if (thread_active_6927) { + // write result + { + ((volatile __local int64_t *) 
red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6919; + eta_p_6920 = eta_p_6919; + } + } + if (sle32(wave_sizze_6908, skip_threads_6926)) { + barrier(CLK_LOCAL_MEM_FENCE); + } + skip_threads_6926 *= 2; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + bool no_carry_in_6929 = squot32(local_tid_6906, 32) == 0 || !ltid_in_bounds_6922; + + // carry-in for every block except the first + { + // read operands + { + if (!no_carry_in_6929) { + eta_p_6330 = eta_p_6329; + eta_p_6329 = ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(squot32(local_tid_6906, 32)) - (int64_t) 1]; + } + } + // perform operation + { + bool inactive_6930 = slt64(srem64(sext_i32_i64(local_tid_6906), num_subhistos_6815), sext_i32_i64(local_tid_6906) - sext_i32_i64(squot32(local_tid_6906, 32) * 32 - 1)); + + if (!no_carry_in_6929) { + if (inactive_6930) { + eta_p_6329 = eta_p_6330; + } + } + if (!no_carry_in_6929) { + if (!inactive_6930) { + int64_t defunc_0_op_res_6331 = add64(eta_p_6329, eta_p_6330); + + eta_p_6329 = defunc_0_op_res_6331; + } + } + } + // write final result + { + if (!no_carry_in_6929) { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6329; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // restore correct values for first block + { + if (squot32(local_tid_6906, 32) == 0 && ltid_in_bounds_6922) { + ((__local int64_t *) red_arr_i64_mem_6910)[sext_i32_i64(local_tid_6906)] = eta_p_6330; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // save final values of segments + { + if (slt64(sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903) + sext_i32_i64(local_tid_6906), (int64_t) 256) && slt64(sext_i32_i64(local_tid_6906), squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903))) { + int64_t tmp_6931 = ((__local int64_t *) red_arr_i64_mem_6910)[(sext_i32_i64(local_tid_6906) + (int64_t) 1) * segment_sizze_nonzzero_6903 - (int64_t) 1]; + + ((__global int64_t *) mem_6758)[sext_i32_i64(virt_tblock_id_6915) * squot64(seghist_tblock_sizze_6321, segment_sizze_nonzzero_6903) + sext_i32_i64(local_tid_6906)] = tmp_6931; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); + } + + error_3: + return; + #undef seghist_tblock_sizze_6321 +} +""" +# Start of values.py. + +# Hacky parser/reader/writer for values written in Futhark syntax. +# Used for reading stdin when compiling standalone programs with the +# Python code generator. + +import numpy as np +import struct +import sys + + +class ReaderInput: + def __init__(self, f): + self.f = f + self.lookahead_buffer = [] + + def get_char(self): + if len(self.lookahead_buffer) == 0: + return self.f.read(1) + else: + c = self.lookahead_buffer[0] + self.lookahead_buffer = self.lookahead_buffer[1:] + return c + + def unget_char(self, c): + self.lookahead_buffer = [c] + self.lookahead_buffer + + def get_chars(self, n): + n1 = min(n, len(self.lookahead_buffer)) + s = b"".join(self.lookahead_buffer[:n1]) + self.lookahead_buffer = self.lookahead_buffer[n1:] + n2 = n - n1 + if n2 > 0: + s += self.f.read(n2) + return s + + def peek_char(self): + c = self.get_char() + if c: + self.unget_char(c) + return c + + +def skip_spaces(f): + c = f.get_char() + while c != None: + if c.isspace(): + c = f.get_char() + elif c == b"-": + # May be line comment. + if f.peek_char() == b"-": + # Yes, line comment. Skip to end of line. 
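+                # (Illustration: Futhark's textual value syntax uses "--"
+                # line comments, so input like b"-- note\n[1i32, 2i32]" is
+                # consumed up to the newline before the array itself is
+                # read; a lone "-", as in b"-5i32", breaks out of the loop
+                # and is pushed back via unget_char below for the numeric
+                # parsers.)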
+ while c != b"\n" and c != None: + c = f.get_char() + else: + break + else: + break + if c: + f.unget_char(c) + + +def parse_specific_char(f, expected): + got = f.get_char() + if got != expected: + f.unget_char(got) + raise ValueError + return True + + +def parse_specific_string(f, s): + # This funky mess is intended, and is caused by the fact that if `type(b) == + # bytes` then `type(b[0]) == int`, but we need to match each element with a + # `bytes`, so therefore we make each character an array element + b = s.encode("utf8") + bs = [b[i : i + 1] for i in range(len(b))] + read = [] + try: + for c in bs: + parse_specific_char(f, c) + read.append(c) + return True + except ValueError: + for c in read[::-1]: + f.unget_char(c) + raise + + +def optional(p, *args): + try: + return p(*args) + except ValueError: + return None + + +def optional_specific_string(f, s): + c = f.peek_char() + # This funky mess is intended, and is caused by the fact that if `type(b) == + # bytes` then `type(b[0]) == int`, but we need to match each element with a + # `bytes`, so therefore we make each character an array element + b = s.encode("utf8") + bs = [b[i : i + 1] for i in range(len(b))] + if c == bs[0]: + return parse_specific_string(f, s) + else: + return False + + +def sepBy(p, sep, *args): + elems = [] + x = optional(p, *args) + if x != None: + elems += [x] + while optional(sep, *args) != None: + x = p(*args) + elems += [x] + return elems + + +# Assumes '0x' has already been read +def parse_hex_int(f): + s = b"" + c = f.get_char() + while c != None: + if c in b"01234556789ABCDEFabcdef": + s += c + c = f.get_char() + elif c == b"_": + c = f.get_char() # skip _ + else: + f.unget_char(c) + break + return str(int(s, 16)).encode("utf8") # ugh + + +def parse_int(f): + s = b"" + c = f.get_char() + if c == b"0" and f.peek_char() in b"xX": + c = f.get_char() # skip X + return parse_hex_int(f) + else: + while c != None: + if c.isdigit(): + s += c + c = f.get_char() + elif c == b"_": + c = f.get_char() # skip _ + else: + f.unget_char(c) + break + if len(s) == 0: + raise ValueError + return s + + +def parse_int_signed(f): + s = b"" + c = f.get_char() + + if c == b"-" and f.peek_char().isdigit(): + return c + parse_int(f) + else: + if c != b"+": + f.unget_char(c) + return parse_int(f) + + +def read_str_comma(f): + skip_spaces(f) + parse_specific_char(f, b",") + return b"," + + +def read_str_int(f, s): + skip_spaces(f) + x = int(parse_int_signed(f)) + optional_specific_string(f, s) + return x + + +def read_str_uint(f, s): + skip_spaces(f) + x = int(parse_int(f)) + optional_specific_string(f, s) + return x + + +def read_str_i8(f): + return np.int8(read_str_int(f, "i8")) + + +def read_str_i16(f): + return np.int16(read_str_int(f, "i16")) + + +def read_str_i32(f): + return np.int32(read_str_int(f, "i32")) + + +def read_str_i64(f): + return np.int64(read_str_int(f, "i64")) + + +def read_str_u8(f): + return np.uint8(read_str_int(f, "u8")) + + +def read_str_u16(f): + return np.uint16(read_str_int(f, "u16")) + + +def read_str_u32(f): + return np.uint32(read_str_int(f, "u32")) + + +def read_str_u64(f): + return np.uint64(read_str_int(f, "u64")) + + +def read_char(f): + skip_spaces(f) + parse_specific_char(f, b"'") + c = f.get_char() + parse_specific_char(f, b"'") + return c + + +def read_str_hex_float(f, sign): + int_part = parse_hex_int(f) + parse_specific_char(f, b".") + frac_part = parse_hex_int(f) + parse_specific_char(f, b"p") + exponent = parse_int(f) + + int_val = int(int_part, 16) + frac_val = float(int(frac_part, 16)) / 
(16 ** len(frac_part)) + exp_val = int(exponent) + + total_val = (int_val + frac_val) * (2.0**exp_val) + if sign == b"-": + total_val = -1 * total_val + + return float(total_val) + + +def read_str_decimal(f): + skip_spaces(f) + c = f.get_char() + if c == b"-": + sign = b"-" + else: + f.unget_char(c) + sign = b"" + + # Check for hexadecimal float + c = f.get_char() + if c == "0" and (f.peek_char() in ["x", "X"]): + f.get_char() + return read_str_hex_float(f, sign) + else: + f.unget_char(c) + + bef = optional(parse_int, f) + if bef == None: + bef = b"0" + parse_specific_char(f, b".") + aft = parse_int(f) + elif optional(parse_specific_char, f, b"."): + aft = parse_int(f) + else: + aft = b"0" + if optional(parse_specific_char, f, b"E") or optional(parse_specific_char, f, b"e"): + expt = parse_int_signed(f) + else: + expt = b"0" + return float(sign + bef + b"." + aft + b"E" + expt) + + +def read_str_f16(f): + skip_spaces(f) + try: + parse_specific_string(f, "f16.nan") + return np.float32(np.nan) + except ValueError: + try: + parse_specific_string(f, "f16.inf") + return np.float32(np.inf) + except ValueError: + try: + parse_specific_string(f, "-f16.inf") + return np.float32(-np.inf) + except ValueError: + x = read_str_decimal(f) + optional_specific_string(f, "f16") + return x + + +def read_str_f32(f): + skip_spaces(f) + try: + parse_specific_string(f, "f32.nan") + return np.float32(np.nan) + except ValueError: + try: + parse_specific_string(f, "f32.inf") + return np.float32(np.inf) + except ValueError: + try: + parse_specific_string(f, "-f32.inf") + return np.float32(-np.inf) + except ValueError: + x = read_str_decimal(f) + optional_specific_string(f, "f32") + return x + + +def read_str_f64(f): + skip_spaces(f) + try: + parse_specific_string(f, "f64.nan") + return np.float64(np.nan) + except ValueError: + try: + parse_specific_string(f, "f64.inf") + return np.float64(np.inf) + except ValueError: + try: + parse_specific_string(f, "-f64.inf") + return np.float64(-np.inf) + except ValueError: + x = read_str_decimal(f) + optional_specific_string(f, "f64") + return x + + +def read_str_bool(f): + skip_spaces(f) + if f.peek_char() == b"t": + parse_specific_string(f, "true") + return True + elif f.peek_char() == b"f": + parse_specific_string(f, "false") + return False + else: + raise ValueError + + +def read_str_empty_array(f, type_name, rank): + parse_specific_string(f, "empty") + parse_specific_char(f, b"(") + dims = [] + for i in range(rank): + parse_specific_string(f, "[") + dims += [int(parse_int(f))] + parse_specific_string(f, "]") + if np.prod(dims) != 0: + raise ValueError + parse_specific_string(f, type_name) + parse_specific_char(f, b")") + + return tuple(dims) + + +def read_str_array_elems(f, elem_reader, type_name, rank): + skip_spaces(f) + try: + parse_specific_char(f, b"[") + except ValueError: + return read_str_empty_array(f, type_name, rank) + else: + xs = sepBy(elem_reader, read_str_comma, f) + skip_spaces(f) + parse_specific_char(f, b"]") + return xs + + +def read_str_array_helper(f, elem_reader, type_name, rank): + def nested_row_reader(_): + return read_str_array_helper(f, elem_reader, type_name, rank - 1) + + if rank == 1: + row_reader = elem_reader + else: + row_reader = nested_row_reader + return read_str_array_elems(f, row_reader, type_name, rank) + + +def expected_array_dims(l, rank): + if rank > 1: + n = len(l) + if n == 0: + elem = [] + else: + elem = l[0] + return [n] + expected_array_dims(elem, rank - 1) + else: + return [len(l)] + + +def verify_array_dims(l, dims): + if 
dims[0] != len(l): + raise ValueError + if len(dims) > 1: + for x in l: + verify_array_dims(x, dims[1:]) + + +def read_str_array(f, elem_reader, type_name, rank, bt): + elems = read_str_array_helper(f, elem_reader, type_name, rank) + if type(elems) == tuple: + # Empty array + return np.empty(elems, dtype=bt) + else: + dims = expected_array_dims(elems, rank) + verify_array_dims(elems, dims) + return np.array(elems, dtype=bt) + + +################################################################################ + +READ_BINARY_VERSION = 2 + +# struct format specified at +# https://docs.python.org/2/library/struct.html#format-characters + + +def mk_bin_scalar_reader(t): + def bin_reader(f): + fmt = FUTHARK_PRIMTYPES[t]["bin_format"] + size = FUTHARK_PRIMTYPES[t]["size"] + tf = FUTHARK_PRIMTYPES[t]["numpy_type"] + return tf(struct.unpack("<" + fmt, f.get_chars(size))[0]) + + return bin_reader + + +read_bin_i8 = mk_bin_scalar_reader("i8") +read_bin_i16 = mk_bin_scalar_reader("i16") +read_bin_i32 = mk_bin_scalar_reader("i32") +read_bin_i64 = mk_bin_scalar_reader("i64") + +read_bin_u8 = mk_bin_scalar_reader("u8") +read_bin_u16 = mk_bin_scalar_reader("u16") +read_bin_u32 = mk_bin_scalar_reader("u32") +read_bin_u64 = mk_bin_scalar_reader("u64") + +read_bin_f16 = mk_bin_scalar_reader("f16") +read_bin_f32 = mk_bin_scalar_reader("f32") +read_bin_f64 = mk_bin_scalar_reader("f64") + +read_bin_bool = mk_bin_scalar_reader("bool") + + +def read_is_binary(f): + skip_spaces(f) + c = f.get_char() + if c == b"b": + bin_version = read_bin_u8(f) + if bin_version != READ_BINARY_VERSION: + panic( + 1, + "binary-input: File uses version %i, but I only understand version %i.\n", + bin_version, + READ_BINARY_VERSION, + ) + return True + else: + f.unget_char(c) + return False + + +FUTHARK_PRIMTYPES = { + "i8": { + "binname": b" i8", + "size": 1, + "bin_reader": read_bin_i8, + "str_reader": read_str_i8, + "bin_format": "b", + "numpy_type": np.int8, + }, + "i16": { + "binname": b" i16", + "size": 2, + "bin_reader": read_bin_i16, + "str_reader": read_str_i16, + "bin_format": "h", + "numpy_type": np.int16, + }, + "i32": { + "binname": b" i32", + "size": 4, + "bin_reader": read_bin_i32, + "str_reader": read_str_i32, + "bin_format": "i", + "numpy_type": np.int32, + }, + "i64": { + "binname": b" i64", + "size": 8, + "bin_reader": read_bin_i64, + "str_reader": read_str_i64, + "bin_format": "q", + "numpy_type": np.int64, + }, + "u8": { + "binname": b" u8", + "size": 1, + "bin_reader": read_bin_u8, + "str_reader": read_str_u8, + "bin_format": "B", + "numpy_type": np.uint8, + }, + "u16": { + "binname": b" u16", + "size": 2, + "bin_reader": read_bin_u16, + "str_reader": read_str_u16, + "bin_format": "H", + "numpy_type": np.uint16, + }, + "u32": { + "binname": b" u32", + "size": 4, + "bin_reader": read_bin_u32, + "str_reader": read_str_u32, + "bin_format": "I", + "numpy_type": np.uint32, + }, + "u64": { + "binname": b" u64", + "size": 8, + "bin_reader": read_bin_u64, + "str_reader": read_str_u64, + "bin_format": "Q", + "numpy_type": np.uint64, + }, + "f16": { + "binname": b" f16", + "size": 2, + "bin_reader": read_bin_f16, + "str_reader": read_str_f16, + "bin_format": "e", + "numpy_type": np.float16, + }, + "f32": { + "binname": b" f32", + "size": 4, + "bin_reader": read_bin_f32, + "str_reader": read_str_f32, + "bin_format": "f", + "numpy_type": np.float32, + }, + "f64": { + "binname": b" f64", + "size": 8, + "bin_reader": read_bin_f64, + "str_reader": read_str_f64, + "bin_format": "d", + "numpy_type": np.float64, + }, + "bool": { + 
"binname": b"bool", + "size": 1, + "bin_reader": read_bin_bool, + "str_reader": read_str_bool, + "bin_format": "b", + "numpy_type": bool, + }, +} + + +def read_bin_read_type(f): + read_binname = f.get_chars(4) + + for k, v in FUTHARK_PRIMTYPES.items(): + if v["binname"] == read_binname: + return k + panic(1, "binary-input: Did not recognize the type '%s'.\n", read_binname) + + +def numpy_type_to_type_name(t): + for k, v in FUTHARK_PRIMTYPES.items(): + if v["numpy_type"] == t: + return k + raise Exception(f"Unknown Numpy type: {t}") + + +def read_bin_ensure_scalar(f, expected_type): + dims = read_bin_i8(f) + + if dims != 0: + panic( + 1, + "binary-input: Expected scalar (0 dimensions), but got array with %i dimensions.\n", + dims, + ) + + bin_type = read_bin_read_type(f) + if bin_type != expected_type: + panic( + 1, + "binary-input: Expected scalar of type %s but got scalar of type %s.\n", + expected_type, + bin_type, + ) + + +# ------------------------------------------------------------------------------ +# General interface for reading Primitive Futhark Values +# ------------------------------------------------------------------------------ + + +def read_scalar(f, ty): + if read_is_binary(f): + read_bin_ensure_scalar(f, ty) + return FUTHARK_PRIMTYPES[ty]["bin_reader"](f) + return FUTHARK_PRIMTYPES[ty]["str_reader"](f) + + +def read_array(f, expected_type, rank): + if not read_is_binary(f): + str_reader = FUTHARK_PRIMTYPES[expected_type]["str_reader"] + return read_str_array( + f, + str_reader, + expected_type, + rank, + FUTHARK_PRIMTYPES[expected_type]["numpy_type"], + ) + + bin_rank = read_bin_u8(f) + + if bin_rank != rank: + panic( + 1, + "binary-input: Expected %i dimensions, but got array with %i dimensions.\n", + rank, + bin_rank, + ) + + bin_type_enum = read_bin_read_type(f) + if expected_type != bin_type_enum: + panic( + 1, + "binary-input: Expected %iD-array with element type '%s' but got %iD-array with element type '%s'.\n", + rank, + expected_type, + bin_rank, + bin_type_enum, + ) + + shape = [] + elem_count = 1 + for i in range(rank): + bin_size = read_bin_i64(f) + elem_count *= bin_size + shape.append(bin_size) + + bin_fmt = FUTHARK_PRIMTYPES[bin_type_enum]["bin_format"] + + # We first read the expected number of types into a bytestring, + # then use np.frombuffer. This is because np.fromfile does not + # work on things that are insufficiently file-like, like a network + # stream. + bytes = f.get_chars(elem_count * FUTHARK_PRIMTYPES[expected_type]["size"]) + arr = np.frombuffer(bytes, dtype=FUTHARK_PRIMTYPES[bin_type_enum]["numpy_type"]) + arr.shape = shape + + return arr.copy() # To ensure it is writeable. + + +input_reader = ReaderInput(sys.stdin.buffer) + +import re + + +def read_value(type_desc, reader=input_reader): + """Read a value of the given type. 
The type is a string + representation of the Futhark type.""" + m = re.match(r"((?:\[\])*)([a-z0-9]+)$", type_desc) + if m: + dims = int(len(m.group(1)) / 2) + basetype = m.group(2) + assert m and basetype in FUTHARK_PRIMTYPES, f"Unknown type: {type_desc}" + if dims > 0: + return read_array(reader, basetype, dims) + else: + return read_scalar(reader, basetype) + + +def end_of_input(entry, f=input_reader): + skip_spaces(f) + if f.get_char() != b"": + panic(1, 'Expected EOF on stdin after reading input for "%s".', entry) + + +def write_value_text(v, out=sys.stdout): + if type(v) == np.uint8: + out.write("%uu8" % v) + elif type(v) == np.uint16: + out.write("%uu16" % v) + elif type(v) == np.uint32: + out.write("%uu32" % v) + elif type(v) == np.uint64: + out.write("%uu64" % v) + elif type(v) == np.int8: + out.write("%di8" % v) + elif type(v) == np.int16: + out.write("%di16" % v) + elif type(v) == np.int32: + out.write("%di32" % v) + elif type(v) == np.int64: + out.write("%di64" % v) + elif type(v) in [bool, np.bool_]: + if v: + out.write("true") + else: + out.write("false") + elif type(v) == np.float16: + if np.isnan(v): + out.write("f16.nan") + elif np.isinf(v): + if v >= 0: + out.write("f16.inf") + else: + out.write("-f16.inf") + else: + out.write("%.6ff16" % v) + elif type(v) == np.float32: + if np.isnan(v): + out.write("f32.nan") + elif np.isinf(v): + if v >= 0: + out.write("f32.inf") + else: + out.write("-f32.inf") + else: + out.write("%.6ff32" % v) + elif type(v) == np.float64: + if np.isnan(v): + out.write("f64.nan") + elif np.isinf(v): + if v >= 0: + out.write("f64.inf") + else: + out.write("-f64.inf") + else: + out.write("%.6ff64" % v) + elif type(v) == np.ndarray: + if np.prod(v.shape) == 0: + tname = numpy_type_to_type_name(v.dtype) + out.write("empty({}{})".format("".join([f"[{d}]" for d in v.shape]), tname)) + else: + first = True + out.write("[") + for x in v: + if not first: + out.write(", ") + first = False + write_value(x, out=out) + out.write("]") + else: + raise Exception(f"Cannot print value of type {type(v)}: {v}") + + +type_strs = { + np.dtype("int8"): b" i8", + np.dtype("int16"): b" i16", + np.dtype("int32"): b" i32", + np.dtype("int64"): b" i64", + np.dtype("uint8"): b" u8", + np.dtype("uint16"): b" u16", + np.dtype("uint32"): b" u32", + np.dtype("uint64"): b" u64", + np.dtype("float16"): b" f16", + np.dtype("float32"): b" f32", + np.dtype("float64"): b" f64", + np.dtype("bool"): b"bool", +} + + +def construct_binary_value(v): + t = v.dtype + shape = v.shape + + elems = 1 + for d in shape: + elems *= d + + num_bytes = 1 + 1 + 1 + 4 + len(shape) * 8 + elems * t.itemsize + bytes = bytearray(num_bytes) + bytes[0] = np.int8(ord("b")) + bytes[1] = 2 + bytes[2] = np.int8(len(shape)) + bytes[3:7] = type_strs[t] + + for i in range(len(shape)): + bytes[7 + i * 8 : 7 + (i + 1) * 8] = np.int64(shape[i]).tobytes() + + bytes[7 + len(shape) * 8 :] = np.ascontiguousarray(v).tobytes() + + return bytes + + +def write_value_binary(v, out=sys.stdout): + out = out.buffer + out.write(construct_binary_value(v)) + + +def write_value(v, out=sys.stdout, binary=False): + if binary: + return write_value_binary(v, out=out) + else: + return write_value_text(v, out=out) + + +# End of values.py. +# Start of memory.py. + +import ctypes as ct + + +def allocateMem(size): + return np.empty(size, dtype=np.byte) + + +# Copy an array if its is not-None. This is important for treating +# Numpy arrays as flat memory, but has some overhead. 
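+# (Illustration: a sliced view such as np.zeros(10)[2:5] has a non-None
+# .base that is not the array itself, so normaliseArray below copies it
+# into a fresh contiguous array before it is treated as flat memory; a
+# freshly allocated array passes through unchanged, and unwrapArray then
+# reinterprets its buffer as raw bytes.)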
+def normaliseArray(x):
+    if (x.base is x) or (x.base is None):
+        return x
+    else:
+        return x.copy()
+
+
+def unwrapArray(x):
+    return x.ravel().view(np.byte)
+
+
+def indexArray(x, offset, bt):
+    return x.view(bt)[offset]
+
+
+def writeScalarArray(x, offset, v):
+    x.view(type(v))[offset] = v
+
+
+# An opaque Futhark value.
+class opaque:
+    def __init__(self, desc, *payload):
+        self.data = payload
+        self.desc = desc
+
+    def __repr__(self):
+        return f"<opaque Futhark value of type {self.desc}>"
+
+
+# LMAD stuff
+
+
+def lmad_contiguous_search(checked, expected, strides, shape, used):
+    for i in range(len(strides)):
+        for j in range(len(strides)):
+            if not used[j] and strides[j] == expected and strides[j] >= 0:
+                used[j] = True
+                if checked + 1 == len(strides) or lmad_contiguous_search(
+                    checked + 1, expected * shape[j], strides, shape, used
+                ):
+                    return True
+                used[j] = False
+    return False
+
+
+def lmad_contiguous(strides, shape):
+    used = len(strides) * [False]
+    return lmad_contiguous_search(0, 1, strides, shape, used)
+
+
+def lmad_memcpyable(dst_strides, src_strides, shape):
+    if not lmad_contiguous(dst_strides, shape):
+        return False
+    for i in range(len(dst_strides)):
+        if dst_strides[i] != src_strides[i] and shape[i] != 1:
+            return False
+    return True
+
+
+def lmad_is_tr(strides, shape):
+    r = len(shape)
+    for i in range(1, r):
+        n = 1
+        m = 1
+        ok = True
+        expected = 1
+        # Check strides before 'i'.
+        for j in range(i - 1, -1, -1):
+            ok = ok and strides[j] == expected
+            expected *= shape[j]
+            n *= shape[j]
+        # Check strides after 'i'.
+        for j in range(r - 1, i - 1, -1):
+            ok = ok and strides[j] == expected
+            expected *= shape[j]
+            m *= shape[j]
+        if ok:
+            return (n, m)
+    return None
+
+
+def lmad_map_tr(dst_strides, src_strides, shape):
+    r = len(dst_strides)
+    rowmajor_strides = [0] * r
+    rowmajor_strides[r - 1] = 1
+
+    for i in range(r - 2, -1, -1):
+        rowmajor_strides[i] = rowmajor_strides[i + 1] * shape[i + 1]
+
+    # map_r will be the number of mapped dimensions on top.
+    map_r = 0
+    k = 1
+    for i in range(r):
+        if dst_strides[i] != rowmajor_strides[i] or src_strides[i] != rowmajor_strides[i]:
+            break
+        else:
+            k *= shape[i]
+            map_r += 1
+
+    if rowmajor_strides[map_r:] == dst_strides[map_r:]:
+        r = lmad_is_tr(src_strides[map_r:], shape[map_r:])
+        if r is not None:
+            (n, m) = r
+            return (k, n, m)
+    elif rowmajor_strides[map_r:] == src_strides[map_r:]:
+        r = lmad_is_tr(dst_strides[map_r:], shape[map_r:])
+        if r is not None:
+            (n, m) = r
+            return (k, m, n)  # Sic!
+    return None
+
+
+def lmad_copy_elements(pt, dst, dst_offset, dst_strides, src, src_offset, src_strides, shape):
+    if len(shape) == 1:
+        for i in range(shape[0]):
+            writeScalarArray(
+                dst,
+                dst_offset + i * dst_strides[0],
+                indexArray(src, src_offset + i * src_strides[0], pt),
+            )
+    else:
+        for i in range(shape[0]):
+            lmad_copy_elements(
+                pt,
+                dst,
+                dst_offset + i * dst_strides[0],
+                dst_strides[1:],
+                src,
+                src_offset + i * src_strides[0],
+                src_strides[1:],
+                shape[1:],
+            )
+
+
+def lmad_copy(pt, dst, dst_offset, dst_strides, src, src_offset, src_strides, shape):
+    if lmad_memcpyable(dst_strides, src_strides, shape):
+        dst[
+            dst_offset * ct.sizeof(pt) : dst_offset * ct.sizeof(pt) + np.prod(shape) * ct.sizeof(pt)
+        ] = src[
+            src_offset * ct.sizeof(pt) : src_offset * ct.sizeof(pt) + np.prod(shape) * ct.sizeof(pt)
+        ]
+    else:
+        lmad_copy_elements(
+            pt,
+            dst,
+            dst_offset,
+            dst_strides,
+            src,
+            src_offset,
+            src_strides,
+            shape,
+        )
+
+
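+# Illustrative sketch, not part of the Futhark-generated runtime: how the
+# LMAD helpers above copy between flat byte buffers viewed as strided
+# arrays. All names used here (lmad_copy, allocateMem, np, ct) are defined
+# earlier in this file.
+if __name__ == "__main__":
+    example_src = np.arange(6, dtype=np.int64).view(np.byte)  # bytes of [0, 1, 2, 3, 4, 5]
+    example_dst = allocateMem(6 * 8)  # room for 6 i64 elements
+    # Gather a transposed 2x3 view (strides [1, 2]) into row-major order
+    # (strides [3, 1]); the stride mismatch makes lmad_memcpyable return
+    # False, so the element-wise path is taken.
+    lmad_copy(ct.c_int64, example_dst, 0, [3, 1], example_src, 0, [1, 2], [2, 3])
+    assert list(example_dst.view(np.int64)) == [0, 2, 4, 1, 3, 5]
+
+# End of memory.py.
+# Start of panic.py.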
+ + +def panic(exitcode, fmt, *args): + sys.stderr.write("%s: " % sys.argv[0]) + sys.stderr.write(fmt % args) + sys.stderr.write("\n") + sys.exit(exitcode) + + +# End of panic.py. +# Start of tuning.py + + +def read_tuning_file(kvs, f): + for line in f.read().splitlines(): + size, value = line.split("=") + kvs[size] = int(value) + return kvs + + +# End of tuning.py. +# Start of scalar.py. + +import numpy as np +import math +import struct + + +def intlit(t, x): + if t == np.int8: + return np.int8(x) + elif t == np.int16: + return np.int16(x) + elif t == np.int32: + return np.int32(x) + else: + return np.int64(x) + + +def signed(x): + if type(x) == np.uint8: + return np.int8(x) + elif type(x) == np.uint16: + return np.int16(x) + elif type(x) == np.uint32: + return np.int32(x) + else: + return np.int64(x) + + +def unsigned(x): + if type(x) == np.int8: + return np.uint8(x) + elif type(x) == np.int16: + return np.uint16(x) + elif type(x) == np.int32: + return np.uint32(x) + else: + return np.uint64(x) + + +def shlN(x, y): + return x << y + + +def ashrN(x, y): + return x >> y + + +# Python is so slow that we just make all the unsafe operations safe, +# always. + + +def sdivN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return x // y + + +def sdiv_upN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return (x + y - intlit(type(x), 1)) // y + + +def smodN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return x % y + + +def udivN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return signed(unsigned(x) // unsigned(y)) + + +def udiv_upN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return signed((unsigned(x) + unsigned(y) - unsigned(intlit(type(x), 1))) // unsigned(y)) + + +def umodN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return signed(unsigned(x) % unsigned(y)) + + +def squotN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return np.floor_divide(np.abs(x), np.abs(y)) * np.sign(x) * np.sign(y) + + +def sremN(x, y): + if y == 0: + return intlit(type(x), 0) + else: + return np.remainder(np.abs(x), np.abs(y)) * np.sign(x) + + +def sminN(x, y): + return min(x, y) + + +def smaxN(x, y): + return max(x, y) + + +def uminN(x, y): + return signed(min(unsigned(x), unsigned(y))) + + +def umaxN(x, y): + return signed(max(unsigned(x), unsigned(y))) + + +def fminN(x, y): + return np.fmin(x, y) + + +def fmaxN(x, y): + return np.fmax(x, y) + + +def powN(x, y): + return x**y + + +def fpowN(x, y): + return x**y + + +def sleN(x, y): + return x <= y + + +def sltN(x, y): + return x < y + + +def uleN(x, y): + return unsigned(x) <= unsigned(y) + + +def ultN(x, y): + return unsigned(x) < unsigned(y) + + +def lshr8(x, y): + return np.int8(np.uint8(x) >> np.uint8(y)) + + +def lshr16(x, y): + return np.int16(np.uint16(x) >> np.uint16(y)) + + +def lshr32(x, y): + return np.int32(np.uint32(x) >> np.uint32(y)) + + +def lshr64(x, y): + return np.int64(np.uint64(x) >> np.uint64(y)) + + +def sext_T_i8(x): + return np.int8(x) + + +def sext_T_i16(x): + return np.int16(x) + + +def sext_T_i32(x): + return np.int32(x) + + +def sext_T_i64(x): + return np.int64(x) + + +def itob_T_bool(x): + return bool(x) + + +def btoi_bool_i8(x): + return np.int8(x) + + +def btoi_bool_i16(x): + return np.int16(x) + + +def btoi_bool_i32(x): + return np.int32(x) + + +def btoi_bool_i64(x): + return np.int64(x) + + +def ftob_T_bool(x): + return bool(x) + + +def btof_bool_f16(x): + return np.float16(x) + + +def btof_bool_f32(x): + return np.float32(x) + + +def 
btof_bool_f64(x):
+    return np.float64(x)
+
+
+def zext_i8_i8(x):
+    return np.int8(np.uint8(x))
+
+
+def zext_i8_i16(x):
+    return np.int16(np.uint8(x))
+
+
+def zext_i8_i32(x):
+    return np.int32(np.uint8(x))
+
+
+def zext_i8_i64(x):
+    return np.int64(np.uint8(x))
+
+
+def zext_i16_i8(x):
+    return np.int8(np.uint16(x))
+
+
+def zext_i16_i16(x):
+    return np.int16(np.uint16(x))
+
+
+def zext_i16_i32(x):
+    return np.int32(np.uint16(x))
+
+
+def zext_i16_i64(x):
+    return np.int64(np.uint16(x))
+
+
+def zext_i32_i8(x):
+    return np.int8(np.uint32(x))
+
+
+def zext_i32_i16(x):
+    return np.int16(np.uint32(x))
+
+
+def zext_i32_i32(x):
+    return np.int32(np.uint32(x))
+
+
+def zext_i32_i64(x):
+    return np.int64(np.uint32(x))
+
+
+def zext_i64_i8(x):
+    return np.int8(np.uint64(x))
+
+
+def zext_i64_i16(x):
+    return np.int16(np.uint64(x))
+
+
+def zext_i64_i32(x):
+    return np.int32(np.uint64(x))
+
+
+def zext_i64_i64(x):
+    return np.int64(np.uint64(x))
+
+
+sdiv8 = sdiv16 = sdiv32 = sdiv64 = sdivN
+sdiv_up8 = sdiv_up16 = sdiv_up32 = sdiv_up64 = sdiv_upN
+sdiv_safe8 = sdiv_safe16 = sdiv_safe32 = sdiv_safe64 = sdivN
+sdiv_up_safe8 = sdiv_up_safe16 = sdiv_up_safe32 = sdiv_up_safe64 = sdiv_upN
+smod8 = smod16 = smod32 = smod64 = smodN
+smod_safe8 = smod_safe16 = smod_safe32 = smod_safe64 = smodN
+udiv8 = udiv16 = udiv32 = udiv64 = udivN
+udiv_up8 = udiv_up16 = udiv_up32 = udiv_up64 = udiv_upN
+udiv_safe8 = udiv_safe16 = udiv_safe32 = udiv_safe64 = udivN
+udiv_up_safe8 = udiv_up_safe16 = udiv_up_safe32 = udiv_up_safe64 = udiv_upN
+umod8 = umod16 = umod32 = umod64 = umodN
+umod_safe8 = umod_safe16 = umod_safe32 = umod_safe64 = umodN
+squot8 = squot16 = squot32 = squot64 = squotN
+squot_safe8 = squot_safe16 = squot_safe32 = squot_safe64 = squotN
+srem8 = srem16 = srem32 = srem64 = sremN
+srem_safe8 = srem_safe16 = srem_safe32 = srem_safe64 = sremN
+
+shl8 = shl16 = shl32 = shl64 = shlN
+ashr8 = ashr16 = ashr32 = ashr64 = ashrN
+smax8 = smax16 = smax32 = smax64 = smaxN
+smin8 = smin16 = smin32 = smin64 = sminN
+umax8 = umax16 = umax32 = umax64 = umaxN
+umin8 = umin16 = umin32 = umin64 = uminN
+pow8 = pow16 = pow32 = pow64 = powN
+fpow16 = fpow32 = fpow64 = fpowN
+fmax16 = fmax32 = fmax64 = fmaxN
+fmin16 = fmin32 = fmin64 = fminN
+sle8 = sle16 = sle32 = sle64 = sleN
+slt8 = slt16 = slt32 = slt64 = sltN
+ule8 = ule16 = ule32 = ule64 = uleN
+ult8 = ult16 = ult32 = ult64 = ultN
+sext_i8_i8 = sext_i16_i8 = sext_i32_i8 = sext_i64_i8 = sext_T_i8
+sext_i8_i16 = sext_i16_i16 = sext_i32_i16 = sext_i64_i16 = sext_T_i16
+sext_i8_i32 = sext_i16_i32 = sext_i32_i32 = sext_i64_i32 = sext_T_i32
+sext_i8_i64 = sext_i16_i64 = sext_i32_i64 = sext_i64_i64 = sext_T_i64
+itob_i8_bool = itob_i16_bool = itob_i32_bool = itob_i64_bool = itob_T_bool
+ftob_f16_bool = ftob_f32_bool = ftob_f64_bool = ftob_T_bool
+
+
+def clz_T(x):
+    n = np.int32(0)
+    bits = x.itemsize * 8
+    for i in range(bits):
+        if x < 0:
+            break
+        n += np.int32(1)
+        x <<= np.int8(1)
+    return n
+
+
+def ctz_T(x):
+    n = np.int32(0)
+    bits = x.itemsize * 8
+    for i in range(bits):
+        if (x & 1) == 1:
+            break
+        n += np.int32(1)
+        x >>= np.int8(1)
+    return n
+
+
+def popc_T(x):
+    c = np.int32(0)
+    while x != 0:
+        x &= x - np.int8(1)
+        c += np.int32(1)
+    return c
+
+
+futhark_popc8 = futhark_popc16 = futhark_popc32 = futhark_popc64 = popc_T
+futhark_clzz8 = futhark_clzz16 = futhark_clzz32 = futhark_clzz64 = clz_T
+futhark_ctzz8 = futhark_ctzz16 = futhark_ctzz32 = futhark_ctzz64 = ctz_T
+
+
+def ssignum(x):
+    return np.sign(x)
+
+
+def usignum(x):
+    if x < 0:
+        return ssignum(-x)
+ else: + return ssignum(x) + + +def sitofp_T_f32(x): + return np.float32(x) + + +sitofp_i8_f32 = sitofp_i16_f32 = sitofp_i32_f32 = sitofp_i64_f32 = sitofp_T_f32 + + +def sitofp_T_f64(x): + return np.float64(x) + + +sitofp_i8_f64 = sitofp_i16_f64 = sitofp_i32_f64 = sitofp_i64_f64 = sitofp_T_f64 + + +def uitofp_T_f32(x): + return np.float32(unsigned(x)) + + +uitofp_i8_f32 = uitofp_i16_f32 = uitofp_i32_f32 = uitofp_i64_f32 = uitofp_T_f32 + + +def uitofp_T_f64(x): + return np.float64(unsigned(x)) + + +uitofp_i8_f64 = uitofp_i16_f64 = uitofp_i32_f64 = uitofp_i64_f64 = uitofp_T_f64 + + +def fptosi_T_i8(x): + if np.isnan(x) or np.isinf(x): + return np.int8(0) + else: + return np.int8(np.trunc(x)) + + +fptosi_f16_i8 = fptosi_f32_i8 = fptosi_f64_i8 = fptosi_T_i8 + + +def fptosi_T_i16(x): + if np.isnan(x) or np.isinf(x): + return np.int16(0) + else: + return np.int16(np.trunc(x)) + + +fptosi_f16_i16 = fptosi_f32_i16 = fptosi_f64_i16 = fptosi_T_i16 + + +def fptosi_T_i32(x): + if np.isnan(x) or np.isinf(x): + return np.int32(0) + else: + return np.int32(np.trunc(x)) + + +fptosi_f16_i32 = fptosi_f32_i32 = fptosi_f64_i32 = fptosi_T_i32 + + +def fptosi_T_i64(x): + if np.isnan(x) or np.isinf(x): + return np.int64(0) + else: + return np.int64(np.trunc(x)) + + +fptosi_f16_i64 = fptosi_f32_i64 = fptosi_f64_i64 = fptosi_T_i64 + + +def fptoui_T_i8(x): + if np.isnan(x) or np.isinf(x): + return np.int8(0) + else: + return np.int8(np.trunc(x)) + + +fptoui_f16_i8 = fptoui_f32_i8 = fptoui_f64_i8 = fptoui_T_i8 + + +def fptoui_T_i16(x): + if np.isnan(x) or np.isinf(x): + return np.int16(0) + else: + return np.int16(np.trunc(x)) + + +fptoui_f16_i16 = fptoui_f32_i16 = fptoui_f64_i16 = fptoui_T_i16 + + +def fptoui_T_i32(x): + if np.isnan(x) or np.isinf(x): + return np.int32(0) + else: + return np.int32(np.trunc(x)) + + +fptoui_f16_i32 = fptoui_f32_i32 = fptoui_f64_i32 = fptoui_T_i32 + + +def fptoui_T_i64(x): + if np.isnan(x) or np.isinf(x): + return np.int64(0) + else: + return np.int64(np.trunc(x)) + + +fptoui_f16_i64 = fptoui_f32_i64 = fptoui_f64_i64 = fptoui_T_i64 + + +def fpconv_f16_f32(x): + return np.float32(x) + + +def fpconv_f16_f64(x): + return np.float64(x) + + +def fpconv_f32_f16(x): + return np.float16(x) + + +def fpconv_f32_f64(x): + return np.float64(x) + + +def fpconv_f64_f16(x): + return np.float16(x) + + +def fpconv_f64_f32(x): + return np.float32(x) + + +def futhark_umul_hi8(a, b): + return np.int8((np.uint64(np.uint8(a)) * np.uint64(np.uint8(b))) >> np.uint64(8)) + + +def futhark_umul_hi16(a, b): + return np.int16((np.uint64(np.uint16(a)) * np.uint64(np.uint16(b))) >> np.uint64(16)) + + +def futhark_umul_hi32(a, b): + return np.int32((np.uint64(np.uint32(a)) * np.uint64(np.uint32(b))) >> np.uint64(32)) + + +def futhark_umul_hi64(a, b): + return np.int64(np.uint64(int(np.uint64(a)) * int(np.uint64(b)) >> 64)) + + +def futhark_smul_hi8(a, b): + return np.int8((np.int64(a) * np.int64(b)) >> np.int64(8)) + + +def futhark_smul_hi16(a, b): + return np.int16((np.int64(a) * np.int64(b)) >> np.int64(16)) + + +def futhark_smul_hi32(a, b): + return np.int32((np.int64(a) * np.int64(b)) >> np.int64(32)) + + +def futhark_smul_hi64(a, b): + return np.int64(int(a) * int(b) >> 64) + + +def futhark_umad_hi8(a, b, c): + return futhark_umul_hi8(a, b) + c + + +def futhark_umad_hi16(a, b, c): + return futhark_umul_hi16(a, b) + c + + +def futhark_umad_hi32(a, b, c): + return futhark_umul_hi32(a, b) + c + + +def futhark_umad_hi64(a, b, c): + return futhark_umul_hi64(a, b) + c + + +def futhark_smad_hi8(a, b, c): + return 
futhark_smul_hi8(a, b) + c + + +def futhark_smad_hi16(a, b, c): + return futhark_smul_hi16(a, b) + c + + +def futhark_smad_hi32(a, b, c): + return futhark_smul_hi32(a, b) + c + + +def futhark_smad_hi64(a, b, c): + return futhark_smul_hi64(a, b) + c + + +def futhark_log64(x): + return np.float64(np.log(x)) + + +def futhark_log2_64(x): + return np.float64(np.log2(x)) + + +def futhark_log10_64(x): + return np.float64(np.log10(x)) + + +def futhark_log1p_64(x): + return np.float64(np.log1p(x)) + + +def futhark_sqrt64(x): + return np.sqrt(x) + + +def futhark_cbrt64(x): + return np.cbrt(x) + + +def futhark_exp64(x): + return np.exp(x) + + +def futhark_cos64(x): + return np.cos(x) + + +def futhark_sin64(x): + return np.sin(x) + + +def futhark_tan64(x): + return np.tan(x) + + +def futhark_acos64(x): + return np.arccos(x) + + +def futhark_asin64(x): + return np.arcsin(x) + + +def futhark_atan64(x): + return np.arctan(x) + + +def futhark_cosh64(x): + return np.cosh(x) + + +def futhark_sinh64(x): + return np.sinh(x) + + +def futhark_tanh64(x): + return np.tanh(x) + + +def futhark_acosh64(x): + return np.arccosh(x) + + +def futhark_asinh64(x): + return np.arcsinh(x) + + +def futhark_atanh64(x): + return np.arctanh(x) + + +def futhark_atan2_64(x, y): + return np.arctan2(x, y) + + +def futhark_hypot64(x, y): + return np.hypot(x, y) + + +def futhark_gamma64(x): + return np.float64(math.gamma(x)) + + +def futhark_lgamma64(x): + return np.float64(math.lgamma(x)) + + +def futhark_erf64(x): + return np.float64(math.erf(x)) + + +def futhark_erfc64(x): + return np.float64(math.erfc(x)) + + +def futhark_round64(x): + return np.round(x) + + +def futhark_ceil64(x): + return np.ceil(x) + + +def futhark_floor64(x): + return np.floor(x) + + +def futhark_nextafter64(x, y): + return np.nextafter(x, y) + + +def futhark_isnan64(x): + return np.isnan(x) + + +def futhark_isinf64(x): + return np.isinf(x) + + +def futhark_to_bits64(x): + s = struct.pack(">d", x) + return np.int64(struct.unpack(">q", s)[0]) + + +def futhark_from_bits64(x): + s = struct.pack(">q", x) + return np.float64(struct.unpack(">d", s)[0]) + + +def futhark_log32(x): + return np.float32(np.log(x)) + + +def futhark_log2_32(x): + return np.float32(np.log2(x)) + + +def futhark_log10_32(x): + return np.float32(np.log10(x)) + + +def futhark_log1p_32(x): + return np.float32(np.log1p(x)) + + +def futhark_sqrt32(x): + return np.float32(np.sqrt(x)) + + +def futhark_cbrt32(x): + return np.float32(np.cbrt(x)) + + +def futhark_exp32(x): + return np.exp(x) + + +def futhark_cos32(x): + return np.cos(x) + + +def futhark_sin32(x): + return np.sin(x) + + +def futhark_tan32(x): + return np.tan(x) + + +def futhark_acos32(x): + return np.arccos(x) + + +def futhark_asin32(x): + return np.arcsin(x) + + +def futhark_atan32(x): + return np.arctan(x) + + +def futhark_cosh32(x): + return np.cosh(x) + + +def futhark_sinh32(x): + return np.sinh(x) + + +def futhark_tanh32(x): + return np.tanh(x) + + +def futhark_acosh32(x): + return np.arccosh(x) + + +def futhark_asinh32(x): + return np.arcsinh(x) + + +def futhark_atanh32(x): + return np.arctanh(x) + + +def futhark_atan2_32(x, y): + return np.arctan2(x, y) + + +def futhark_hypot32(x, y): + return np.hypot(x, y) + + +def futhark_gamma32(x): + return np.float32(math.gamma(x)) + + +def futhark_lgamma32(x): + return np.float32(math.lgamma(x)) + + +def futhark_erf32(x): + return np.float32(math.erf(x)) + + +def futhark_erfc32(x): + return np.float32(math.erfc(x)) + + +def futhark_round32(x): + return np.round(x) + + +def 
futhark_ceil32(x): + return np.ceil(x) + + +def futhark_floor32(x): + return np.floor(x) + + +def futhark_nextafter32(x, y): + return np.nextafter(x, y) + + +def futhark_isnan32(x): + return np.isnan(x) + + +def futhark_isinf32(x): + return np.isinf(x) + + +def futhark_to_bits32(x): + s = struct.pack(">f", x) + return np.int32(struct.unpack(">l", s)[0]) + + +def futhark_from_bits32(x): + s = struct.pack(">l", x) + return np.float32(struct.unpack(">f", s)[0]) + + +def futhark_log16(x): + return np.float16(np.log(x)) + + +def futhark_log2_16(x): + return np.float16(np.log2(x)) + + +def futhark_log10_16(x): + return np.float16(np.log10(x)) + + +def futhark_log1p_16(x): + return np.float16(np.log1p(x)) + + +def futhark_sqrt16(x): + return np.float16(np.sqrt(x)) + + +def futhark_cbrt16(x): + return np.float16(np.cbrt(x)) + + +def futhark_exp16(x): + return np.exp(x) + + +def futhark_cos16(x): + return np.cos(x) + + +def futhark_sin16(x): + return np.sin(x) + + +def futhark_tan16(x): + return np.tan(x) + + +def futhark_acos16(x): + return np.arccos(x) + + +def futhark_asin16(x): + return np.arcsin(x) + + +def futhark_atan16(x): + return np.arctan(x) + + +def futhark_cosh16(x): + return np.cosh(x) + + +def futhark_sinh16(x): + return np.sinh(x) + + +def futhark_tanh16(x): + return np.tanh(x) + + +def futhark_acosh16(x): + return np.arccosh(x) + + +def futhark_asinh16(x): + return np.arcsinh(x) + + +def futhark_atanh16(x): + return np.arctanh(x) + + +def futhark_atan2_16(x, y): + return np.arctan2(x, y) + + +def futhark_hypot16(x, y): + return np.hypot(x, y) + + +def futhark_gamma16(x): + return np.float16(math.gamma(x)) + + +def futhark_lgamma16(x): + return np.float16(math.lgamma(x)) + + +def futhark_erf16(x): + return np.float16(math.erf(x)) + + +def futhark_erfc16(x): + return np.float16(math.erfc(x)) + + +def futhark_round16(x): + return np.round(x) + + +def futhark_ceil16(x): + return np.ceil(x) + + +def futhark_floor16(x): + return np.floor(x) + + +def futhark_nextafter16(x, y): + return np.nextafter(x, y) + + +def futhark_isnan16(x): + return np.isnan(x) + + +def futhark_isinf16(x): + return np.isinf(x) + + +def futhark_to_bits16(x): + s = struct.pack(">e", x) + return np.int16(struct.unpack(">H", s)[0]) + + +def futhark_from_bits16(x): + s = struct.pack(">H", np.uint16(x)) + return np.float16(struct.unpack(">e", s)[0]) + + +def futhark_lerp16(v0, v1, t): + return v0 + (v1 - v0) * t + + +def futhark_lerp32(v0, v1, t): + return v0 + (v1 - v0) * t + + +def futhark_lerp64(v0, v1, t): + return v0 + (v1 - v0) * t + + +def futhark_ldexp16(x, y): + return np.ldexp(x, y) + + +def futhark_ldexp32(x, y): + return np.ldexp(x, y) + + +def futhark_ldexp64(x, y): + return np.ldexp(x, y) + + +def futhark_mad16(a, b, c): + return a * b + c + + +def futhark_mad32(a, b, c): + return a * b + c + + +def futhark_mad64(a, b, c): + return a * b + c + + +def futhark_fma16(a, b, c): + return a * b + c + + +def futhark_fma32(a, b, c): + return a * b + c + + +def futhark_fma64(a, b, c): + return a * b + c + + +futhark_copysign16 = futhark_copysign32 = futhark_copysign64 = np.copysign + +# End of scalar.py. 
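+# Illustrative sketch, not part of the Futhark-generated runtime: the
+# scalar wrappers in scalar.py above make Futhark's "unsafe" integer ops
+# total (a zero divisor yields 0 rather than raising), emulate unsigned
+# comparisons on numpy's signed types, and implement bit-level casts via
+# struct. Only names defined earlier in this file are used here.
+if __name__ == "__main__":
+    assert sdiv32(np.int32(7), np.int32(2)) == np.int32(3)  # floor division
+    assert sdiv32(np.int32(7), np.int32(0)) == np.int32(0)  # total: no ZeroDivisionError
+    assert umax8(np.int8(-1), np.int8(1)) == np.int8(-1)  # -1 compares as 255 unsigned
+    example_bits = futhark_to_bits32(np.float32(1.0))  # IEEE 754 bit pattern of 1.0f
+    assert example_bits == np.int32(0x3F800000)
+    assert futhark_from_bits32(example_bits) == np.float32(1.0)  # lossless round trip
+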
+# Start of server.py + +import sys +import time +import shlex # For string splitting + + +class Server: + def __init__(self, ctx): + self._ctx = ctx + self._vars = {} + + class Failure(BaseException): + def __init__(self, msg): + self.msg = msg + + def _get_arg(self, args, i): + if i < len(args): + return args[i] + else: + raise self.Failure("Insufficient command args") + + def _get_entry_point(self, entry): + if entry in self._ctx.entry_points: + return self._ctx.entry_points[entry] + else: + raise self.Failure("Unknown entry point: %s" % entry) + + def _check_var(self, vname): + if not vname in self._vars: + raise self.Failure("Unknown variable: %s" % vname) + + def _check_new_var(self, vname): + if vname in self._vars: + raise self.Failure("Variable already exists: %s" % vname) + + def _get_var(self, vname): + self._check_var(vname) + return self._vars[vname] + + def _cmd_inputs(self, args): + entry = self._get_arg(args, 0) + for t in self._get_entry_point(entry)[1]: + print(t) + + def _cmd_outputs(self, args): + entry = self._get_arg(args, 0) + for t in self._get_entry_point(entry)[2]: + print(t) + + def _cmd_dummy(self, args): + pass + + def _cmd_free(self, args): + for vname in args: + self._check_var(vname) + del self._vars[vname] + + def _cmd_rename(self, args): + oldname = self._get_arg(args, 0) + newname = self._get_arg(args, 1) + self._check_var(oldname) + self._check_new_var(newname) + self._vars[newname] = self._vars[oldname] + del self._vars[oldname] + + def _cmd_call(self, args): + entry = self._get_entry_point(self._get_arg(args, 0)) + entry_fname = entry[0] + num_ins = len(entry[1]) + num_outs = len(entry[2]) + exp_len = 1 + num_outs + num_ins + + if len(args) != exp_len: + raise self.Failure("Invalid argument count, expected %d" % exp_len) + + out_vnames = args[1 : num_outs + 1] + + for out_vname in out_vnames: + self._check_new_var(out_vname) + + in_vnames = args[1 + num_outs :] + ins = [self._get_var(in_vname) for in_vname in in_vnames] + + try: + (runtime, vals) = getattr(self._ctx, entry_fname)(*ins) + except Exception as e: + raise self.Failure(str(e)) + + print("runtime: %d" % runtime) + + if num_outs == 1: + self._vars[out_vnames[0]] = vals + else: + for out_vname, val in zip(out_vnames, vals): + self._vars[out_vname] = val + + def _store_val(self, f, value): + # In case we are using the PyOpenCL backend, we first + # need to convert OpenCL arrays to ordinary NumPy + # arrays. We do this in a nasty way. + if isinstance(value, opaque): + for component in value.data: + self._store_val(f, component) + elif ( + isinstance(value, np.number) + or isinstance(value, bool) + or isinstance(value, np.bool_) + or isinstance(value, np.ndarray) + ): + # Ordinary NumPy value. + f.write(construct_binary_value(value)) + else: + # Assuming PyOpenCL array. 
+ f.write(construct_binary_value(value.get())) + + def _cmd_store(self, args): + fname = self._get_arg(args, 0) + + with open(fname, "wb") as f: + for i in range(1, len(args)): + self._store_val(f, self._get_var(args[i])) + + def _restore_val(self, reader, typename): + if typename in self._ctx.opaques: + vs = [] + for t in self._ctx.opaques[typename]: + vs += [read_value(t, reader)] + return opaque(typename, *vs) + else: + return read_value(typename, reader) + + def _cmd_restore(self, args): + if len(args) % 2 == 0: + raise self.Failure("Invalid argument count") + + fname = args[0] + args = args[1:] + + with open(fname, "rb") as f: + reader = ReaderInput(f) + while args != []: + vname = args[0] + typename = args[1] + args = args[2:] + + if vname in self._vars: + raise self.Failure("Variable already exists: %s" % vname) + + try: + self._vars[vname] = self._restore_val(reader, typename) + except ValueError: + raise self.Failure( + "Failed to restore variable %s.\n" + "Possibly malformed data in %s.\n" % (vname, fname) + ) + + skip_spaces(reader) + if reader.get_char() != b"": + raise self.Failure("Expected EOF after reading values") + + def _cmd_types(self, args): + for k in self._ctx.opaques.keys(): + print(k) + + def _cmd_entry_points(self, args): + for k in self._ctx.entry_points.keys(): + print(k) + + _commands = { + "inputs": _cmd_inputs, + "outputs": _cmd_outputs, + "call": _cmd_call, + "restore": _cmd_restore, + "store": _cmd_store, + "free": _cmd_free, + "rename": _cmd_rename, + "clear": _cmd_dummy, + "pause_profiling": _cmd_dummy, + "unpause_profiling": _cmd_dummy, + "report": _cmd_dummy, + "types": _cmd_types, + "entry_points": _cmd_entry_points, + } + + def _process_line(self, line): + lex = shlex.shlex(line) + lex.quotes = '"' + lex.whitespace_split = True + lex.commenters = "" + words = list(lex) + if words == []: + raise self.Failure("Empty line") + else: + cmd = words[0] + args = words[1:] + if cmd in self._commands: + self._commands[cmd](self, args) + else: + raise self.Failure("Unknown command: %s" % cmd) + + def run(self): + while True: + print("%%% OK", flush=True) + line = sys.stdin.readline() + if line == "": + return + try: + self._process_line(line) + except self.Failure as e: + print("%%% FAILURE") + print(e.msg) + + +# End of server.py +class entropy: + entry_points = { + "byte_histogram": ("byte_histogram", ["[]u8"], ["[]i64"]), + "chunked_entropy": ("chunked_entropy", ["i64", "[]u8"], ["[]u8"]), + "entropy": ("entropy", ["[]u8"], ["f32"]), + } + opaques = {} + + def __init__( + self, + build_options=build_options, + command_queue=None, + interactive=False, + platform_pref=preferred_platform, + device_pref=preferred_device, + default_group_size=default_group_size, + default_num_groups=default_num_groups, + default_tile_size=default_tile_size, + default_reg_tile_size=default_reg_tile_size, + default_threshold=default_threshold, + sizes=sizes, + ): + size_heuristics = [ + ("NVIDIA CUDA", cl.device_type.GPU, "lockstep_width", lambda device: np.int32(32)), + ( + "AMD Accelerated Parallel Processing", + cl.device_type.GPU, + "lockstep_width", + lambda device: np.int32(32), + ), + ("", cl.device_type.GPU, "lockstep_width", lambda device: np.int32(1)), + ( + "", + cl.device_type.GPU, + "num_groups", + lambda device: ( + np.int32(4) * device.get_info(getattr(cl.device_info, "MAX_COMPUTE_UNITS")) + ), + ), + ("", cl.device_type.GPU, "group_size", lambda device: np.int32(256)), + ("", cl.device_type.GPU, "tile_size", lambda device: np.int32(16)), + ("", cl.device_type.GPU, 
"reg_tile_size", lambda device: np.int32(4)), + ("", cl.device_type.GPU, "threshold", lambda device: np.int32(32768)), + ("", cl.device_type.CPU, "lockstep_width", lambda device: np.int32(1)), + ( + "", + cl.device_type.CPU, + "num_groups", + lambda device: device.get_info(getattr(cl.device_info, "MAX_COMPUTE_UNITS")), + ), + ("", cl.device_type.CPU, "group_size", lambda device: np.int32(32)), + ("", cl.device_type.CPU, "tile_size", lambda device: np.int32(4)), + ("", cl.device_type.CPU, "reg_tile_size", lambda device: np.int32(1)), + ( + "", + cl.device_type.CPU, + "threshold", + lambda device: device.get_info(getattr(cl.device_info, "MAX_COMPUTE_UNITS")), + ), + ] + self.global_failure_args_max = 3 + self.failure_msgs = [ + "Index [{}:{}] out of bounds for array of shape [{}].\n-> #0 ofrak_gpu/entropy.fut:13:44-83\n #1 /prelude/functional.fut:9:44-45\n #2 ofrak_gpu/entropy.fut:13:8-93\n #3 ofrak_gpu/entropy.fut:11:1-13:93\n", + "Index [{}:{}] out of bounds for array of shape [{}].\n-> #0 ofrak_gpu/entropy.fut:13:44-83\n #1 /prelude/functional.fut:9:44-45\n #2 ofrak_gpu/entropy.fut:13:8-93\n #3 ofrak_gpu/entropy.fut:11:1-13:93\n", + ] + constants = [ + ( + "entropyzisegred_nonseg_6344_dim1", + lambda: self.sizes["entropy.segred_tblock_size_6336"], + ), + ( + "entropyzisegred_nonseg_6344zisegred_tblock_sizze_6337", + lambda: self.sizes["entropy.segred_tblock_size_6336"], + ), + ("entropyzisegred_nonseg_6344zichunk_sizze_6996", lambda: np.int64(1)), + ( + "entropyzisegred_large_6901_dim1", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ( + "entropyzisegred_large_6901ziseghist_tblock_sizze_6321", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ("entropyzisegred_large_6901zichunk_sizze_6902", lambda: np.int64(1)), + ( + "entropyzisegred_small_6901_dim1", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ( + "entropyzisegred_small_6901ziseghist_tblock_sizze_6321", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ( + "entropyziseghist_global_6328_dim1", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ( + "entropyziseghist_global_6328ziseghist_tblock_sizze_6321", + lambda: self.sizes["entropy.seghist_tblock_size_6320"], + ), + ("entropyziseghist_local_6328_dim1", lambda: self.max_thread_block_size), + ( + "entropyziseghist_local_6328zimax_tblock_sizze_6825", + lambda: self.max_thread_block_size, + ), + ( + "chunked_entropyzisegmap_6687_dim1", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6456"], + ), + ( + "chunked_entropyzisegmap_6687zisegmap_tblock_sizze_6683", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6456"], + ), + ( + "chunked_entropyzisegred_large_6669_dim1", + lambda: self.sizes["chunked_entropy.segred_tblock_size_6473"], + ), + ( + "chunked_entropyzisegred_large_6669zisegred_tblock_sizze_6663", + lambda: self.sizes["chunked_entropy.segred_tblock_size_6473"], + ), + ("chunked_entropyzisegred_large_6669zichunk_sizze_6833", lambda: np.int64(1)), + ( + "chunked_entropyzisegred_small_6669_dim1", + lambda: self.sizes["chunked_entropy.segred_tblock_size_6473"], + ), + ( + "chunked_entropyzisegred_small_6669zisegred_tblock_sizze_6663", + lambda: self.sizes["chunked_entropy.segred_tblock_size_6473"], + ), + ( + "chunked_entropyzisegmap_6645_dim1", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6523"], + ), + ( + "chunked_entropyzisegmap_6645zisegmap_tblock_sizze_6639", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6523"], + ), + ( + "chunked_entropyzisegmap_6606_dim1", 
+ lambda: self.sizes["chunked_entropy.segmap_tblock_size_6577"], + ), + ( + "chunked_entropyzisegmap_6606zisegmap_tblock_sizze_6600", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6577"], + ), + ( + "chunked_entropyzisegmap_6405_dim1", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6350"], + ), + ( + "chunked_entropyzisegmap_6405zisegmap_tblock_sizze_6401", + lambda: self.sizes["chunked_entropy.segmap_tblock_size_6350"], + ), + ( + "byte_histogramzisegred_large_6901_dim1", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ( + "byte_histogramzisegred_large_6901ziseghist_tblock_sizze_6305", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ("byte_histogramzisegred_large_6901zichunk_sizze_6902", lambda: np.int64(1)), + ( + "byte_histogramzisegred_small_6901_dim1", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ( + "byte_histogramzisegred_small_6901ziseghist_tblock_sizze_6305", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ( + "byte_histogramziseghist_global_6312_dim1", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ( + "byte_histogramziseghist_global_6312ziseghist_tblock_sizze_6305", + lambda: self.sizes["byte_histogram.seghist_tblock_size_6304"], + ), + ("byte_histogramziseghist_local_6312_dim1", lambda: self.max_thread_block_size), + ( + "byte_histogramziseghist_local_6312zimax_tblock_sizze_6825", + lambda: self.max_thread_block_size, + ), + ] + program = initialise_opencl_object( + self, + program_src=fut_opencl_src, + build_options=build_options, + command_queue=command_queue, + interactive=interactive, + platform_pref=platform_pref, + device_pref=device_pref, + default_group_size=default_group_size, + default_num_groups=default_num_groups, + default_tile_size=default_tile_size, + default_reg_tile_size=default_reg_tile_size, + default_threshold=default_threshold, + size_heuristics=size_heuristics, + required_types=["i8", "i32", "i64", "f32", "bool", "unit"], + user_sizes=sizes, + all_sizes={ + "builtin#replicate_i32.tblock_size_6879": { + "class": "thread_block_size", + "value": None, + }, + "builtin#replicate_i64.tblock_size_6803": { + "class": "thread_block_size", + "value": None, + }, + "byte_histogram.seghist_num_tblocks_6306": {"class": "grid_size", "value": None}, + "byte_histogram.seghist_tblock_size_6304": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segmap_num_tblocks_6525": {"class": "grid_size", "value": None}, + "chunked_entropy.segmap_tblock_size_6350": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segmap_tblock_size_6456": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segmap_tblock_size_6523": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segmap_tblock_size_6577": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.segred_num_tblocks_6475": {"class": "grid_size", "value": None}, + "chunked_entropy.segred_tblock_size_6473": { + "class": "thread_block_size", + "value": None, + }, + "chunked_entropy.suff_outer_par_0": {"class": "threshold(def, )", "value": None}, + "entropy.seghist_num_tblocks_6322": {"class": "grid_size", "value": None}, + "entropy.seghist_tblock_size_6320": {"class": "thread_block_size", "value": None}, + "entropy.segred_num_tblocks_6338": {"class": "grid_size", "value": None}, + "entropy.segred_tblock_size_6336": {"class": "thread_block_size", "value": None}, + }, + 
            constants=constants,
+        )
+        self.builtinzhreplicate_i32zireplicate_6875_var = (
+            program.builtinzhreplicate_i32zireplicate_6875
+        )
+        self.builtinzhreplicate_i64zireplicate_6799_var = (
+            program.builtinzhreplicate_i64zireplicate_6799
+        )
+        self.byte_histogramziseghist_global_6312_var = program.byte_histogramziseghist_global_6312
+        self.byte_histogramziseghist_local_6312_var = program.byte_histogramziseghist_local_6312
+        self.byte_histogramzisegred_large_6901_var = program.byte_histogramzisegred_large_6901
+        self.byte_histogramzisegred_small_6901_var = program.byte_histogramzisegred_small_6901
+        self.chunked_entropyzisegmap_6405_var = program.chunked_entropyzisegmap_6405
+        self.chunked_entropyzisegmap_6606_var = program.chunked_entropyzisegmap_6606
+        self.chunked_entropyzisegmap_6645_var = program.chunked_entropyzisegmap_6645
+        self.chunked_entropyzisegmap_6687_var = program.chunked_entropyzisegmap_6687
+        self.chunked_entropyzisegred_large_6669_var = program.chunked_entropyzisegred_large_6669
+        self.chunked_entropyzisegred_small_6669_var = program.chunked_entropyzisegred_small_6669
+        self.entropyziseghist_global_6328_var = program.entropyziseghist_global_6328
+        self.entropyziseghist_local_6328_var = program.entropyziseghist_local_6328
+        self.entropyzisegred_large_6901_var = program.entropyzisegred_large_6901
+        self.entropyzisegred_nonseg_6344_var = program.entropyzisegred_nonseg_6344
+        self.entropyzisegred_small_6901_var = program.entropyzisegred_small_6901
+        self.constants = {}
+        self.constants["counters_mem_6938"] = opencl_alloc(
+            self, np.int64(81920), 'self.constants["counters_mem_6938"]'
+        )
+        self.futhark_builtinzhreplicate_i32(
+            self.constants["counters_mem_6938"], np.int64(20480), np.int32(0)
+        )
+        self.constants["counters_mem_6997"] = opencl_alloc(
+            self, np.int64(80), 'self.constants["counters_mem_6997"]'
+        )
+        self.futhark_builtinzhreplicate_i32(
+            self.constants["counters_mem_6997"], np.int64(20), np.int32(0)
+        )
+        self.constants["counters_mem_6868"] = opencl_alloc(
+            self, np.int64(81920), 'self.constants["counters_mem_6868"]'
+        )
+        self.futhark_builtinzhreplicate_i32(
+            self.constants["counters_mem_6868"], np.int64(20480), np.int32(0)
+        )
+
+    def futhark_builtinzhreplicate_i32(self, mem_6870, num_elems_6871, val_6872):
+        replicate_n_6874 = num_elems_6871
+        tblock_sizze_6879 = self.sizes["builtin#replicate_i32.tblock_size_6879"]
+        virt_num_tblocks_6880 = sdiv_up64(replicate_n_6874, tblock_sizze_6879)
+        num_tblocks_6881 = smin64(virt_num_tblocks_6880, np.int64(1048576))
+        if (1 * (np.int64(num_tblocks_6881) * np.int64(tblock_sizze_6879))) != 0:
+            self.builtinzhreplicate_i32zireplicate_6875_var.set_args(
+                cl.LocalMemory(max(np.int64(0), 1)),
+                ct.c_int64(num_elems_6871),
+                ct.c_int32(val_6872),
+                ct.c_int64(replicate_n_6874),
+                ct.c_int64(virt_num_tblocks_6880),
+                ct.c_int64(num_tblocks_6881),
+                mem_6870,
+            )
+            cl.enqueue_nd_range_kernel(
+                self.queue,
+                self.builtinzhreplicate_i32zireplicate_6875_var,
+                ((np.int64(num_tblocks_6881) * np.int64(tblock_sizze_6879)),),
+                (np.int64(tblock_sizze_6879),),
+            )
+            if synchronous:
+                sync(self)
+        return ()
+
+    def futhark_builtinzhreplicate_i64(self, mem_6794, num_elems_6795, val_6796):
+        replicate_n_6798 = num_elems_6795
+        tblock_sizze_6803 = self.sizes["builtin#replicate_i64.tblock_size_6803"]
virt_num_tblocks_6804 = sdiv_up64(replicate_n_6798, tblock_sizze_6803) + num_tblocks_6805 = smin64(virt_num_tblocks_6804, np.int64(1048576)) + if (1 * (np.int64(num_tblocks_6805) * np.int64(tblock_sizze_6803))) != 0: + self.builtinzhreplicate_i64zireplicate_6799_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + ct.c_int64(num_elems_6795), + ct.c_int64(val_6796), + ct.c_int64(replicate_n_6798), + ct.c_int64(virt_num_tblocks_6804), + ct.c_int64(num_tblocks_6805), + mem_6794, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.builtinzhreplicate_i64zireplicate_6799_var, + ((np.int64(num_tblocks_6805) * np.int64(tblock_sizze_6803)),), + (np.int64(tblock_sizze_6803),), + ) + if synchronous: + sync(self) + return () + + def futhark_entry_byte_histogram(self, xs_mem_6757, n_5765): + mem_6758 = opencl_alloc(self, np.int64(2048), "mem_6758") + self.futhark_builtinzhreplicate_i64(mem_6758, np.int64(256), np.int64(0)) + seghist_tblock_sizze_6305 = self.sizes["byte_histogram.seghist_tblock_size_6304"] + max_num_tblocks_6814 = self.sizes["byte_histogram.seghist_num_tblocks_6306"] + num_tblocks_6307 = sext_i64_i32( + smax64( + np.int64(1), + smin64(sdiv_up64(n_5765, seghist_tblock_sizze_6305), max_num_tblocks_6814), + ) + ) + h_6818 = np.int64(2048) + seg_h_6819 = np.int64(2048) + if seg_h_6819 == np.int64(0): + pass + else: + hist_H_6820 = np.int64(256) + hist_el_sizze_6821 = sdiv_up64(h_6818, hist_H_6820) + hist_N_6822 = n_5765 + hist_RF_6823 = np.int32(1) + hist_L_6824 = self.max_shared_memory + max_tblock_sizze_6825 = self.max_thread_block_size + num_tblocks_6826 = sdiv_up64( + sext_i32_i64(sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305)), + max_tblock_sizze_6825, + ) + hist_m_prime_6827 = sitofp_i64_f64( + smin64( + squot64(hist_L_6824, hist_el_sizze_6821), + sdiv_up64(hist_N_6822, num_tblocks_6826), + ) + ) / sitofp_i64_f64(hist_H_6820) + hist_M0_6828 = smax64( + np.int64(1), smin64(fptosi_f64_i64(hist_m_prime_6827), max_tblock_sizze_6825) + ) + hist_Nout_6829 = np.int64(1) + hist_Nin_6830 = n_5765 + work_asymp_M_max_6831 = squot64( + (hist_Nout_6829 * hist_N_6822), ((np.int64(2) * num_tblocks_6826) * hist_H_6820) + ) + hist_M_6832 = sext_i64_i32(smin64(hist_M0_6828, work_asymp_M_max_6831)) + hist_C_6833 = sdiv_up64( + max_tblock_sizze_6825, sext_i32_i64(smax32(np.int32(1), hist_M_6832)) + ) + local_mem_needed_6834 = hist_el_sizze_6821 * sext_i32_i64(hist_M_6832) + hist_S_6835 = sext_i64_i32( + sdiv_up64(((hist_H_6820 * local_mem_needed_6834) + np.int64(1)), hist_L_6824) + ) + if sle64(hist_H_6820, hist_Nin_6830) and ( + sle64(local_mem_needed_6834, hist_L_6824) + and ( + sle32(hist_S_6835, np.int32(3)) + and ( + sle64(hist_C_6833, max_tblock_sizze_6825) + and slt32(np.int32(0), hist_M_6832) + ) + ) + ): + num_subhistos_6815 = num_tblocks_6826 + if num_subhistos_6815 == np.int64(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + defunc_0_map_res_subhistos_mem_6816 = opencl_alloc( + self, + ((num_subhistos_6815 * np.int64(256)) * np.int64(8)), + "defunc_0_map_res_subhistos_mem_6816", + ) + self.futhark_builtinzhreplicate_i64( + defunc_0_map_res_subhistos_mem_6816, + (num_subhistos_6815 * np.int64(256)), + np.int64(0), + ) + lmad_copy_gpu2gpu( + self, + ct.c_int64, + defunc_0_map_res_subhistos_mem_6816, + np.int64(0), + [np.int64(1)], + mem_6758, + np.int64(0), + [np.int64(1)], + [np.int64(256)], + ) + chk_i_6836 = np.int32(0) + one_7031 = np.int32(1) + for counter_7030 in range(hist_S_6835): + num_segments_6837 = np.int64(1) + hist_H_chk_6838 = 
sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6835)) + histo_sizze_6839 = hist_H_chk_6838 + init_per_thread_6840 = sext_i64_i32( + sdiv_up64( + (sext_i32_i64(hist_M_6832) * histo_sizze_6839), max_tblock_sizze_6825 + ) + ) + if (1 * (np.int64(num_tblocks_6826) * self.max_thread_block_size)) != 0: + self.byte_histogramziseghist_local_6312_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(8) * (hist_M_6832 * hist_H_chk_6838)) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * (hist_M_6832 * hist_H_chk_6838)), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(n_5765), + ct.c_int64(num_subhistos_6815), + ct.c_int64(num_tblocks_6826), + ct.c_int32(hist_M_6832), + ct.c_int32(chk_i_6836), + ct.c_int64(num_segments_6837), + ct.c_int64(hist_H_chk_6838), + ct.c_int64(histo_sizze_6839), + ct.c_int32(init_per_thread_6840), + xs_mem_6757, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.byte_histogramziseghist_local_6312_var, + ((np.int64(num_tblocks_6826) * self.max_thread_block_size),), + (self.max_thread_block_size,), + ) + if synchronous: + sync(self) + chk_i_6836 += one_7031 + else: + hist_H_6872 = np.int64(256) + hist_RF_6873 = (np.float64(0.0) + sitofp_i32_f64(np.int64(1))) / np.float64(1.0) + hist_el_sizze_6874 = np.int32(8) + hist_C_max_6875 = fmin64( + sitofp_i32_f64(sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305)), + (sitofp_i32_f64(hist_H_6872) / np.float64(2.0)), + ) + hist_M_min_6876 = smax32( + np.int32(1), + sext_i64_i32( + fptosi_f64_i64( + sitofp_i32_f64( + sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305) + ) + / hist_C_max_6875 + ) + ), + ) + L2_sizze_6877 = self.max_cache + hist_RACE_exp_6878 = fmax64( + np.float64(1.0), + ( + (np.float64(0.75) * hist_RF_6873) + / (np.float64(64.0) / sitofp_i32_f64(hist_el_sizze_6874)) + ), + ) + if slt64(n_5765, hist_H_6872): + hist_S_6879 = np.int32(1) + else: + hist_S_6879 = sext_i64_i32( + sdiv_up64( + ( + (sext_i32_i64(hist_M_min_6876) * hist_H_6872) + * sext_i32_i64(hist_el_sizze_6874) + ), + fptosi_f64_i64( + (np.float64(0.4) * sitofp_i32_f64(L2_sizze_6877)) + * hist_RACE_exp_6878 + ), + ) + ) + hist_H_chk_6880 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6879)) + hist_k_max_6881 = fmin64( + ( + ( + np.float64(0.4) + * (sitofp_i32_f64(L2_sizze_6877) / sitofp_i32_f64(np.int32(8))) + ) + * hist_RACE_exp_6878 + ), + sitofp_i32_f64(n_5765), + ) / sitofp_i32_f64(sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305)) + hist_u_6882 = np.int64(2) + hist_C_6883 = fmin64( + sitofp_i32_f64(sext_i64_i32(num_tblocks_6307 * seghist_tblock_sizze_6305)), + (sitofp_i32_f64(hist_u_6882 * hist_H_chk_6880) / hist_k_max_6881), + ) + hist_M_6884 = np.int32(1) + num_subhistos_6815 = sext_i32_i64(hist_M_6884) + if hist_M_6884 == np.int32(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + if num_subhistos_6815 == np.int64(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + defunc_0_map_res_subhistos_mem_6816 = opencl_alloc( + self, + ((num_subhistos_6815 * np.int64(256)) * np.int64(8)), + "defunc_0_map_res_subhistos_mem_6816", + ) + self.futhark_builtinzhreplicate_i64( + defunc_0_map_res_subhistos_mem_6816, + (num_subhistos_6815 * np.int64(256)), + np.int64(0), + ) + lmad_copy_gpu2gpu( + self, + ct.c_int64, + defunc_0_map_res_subhistos_mem_6816, + np.int64(0), + [np.int64(1)], + mem_6758, + np.int64(0), + [np.int64(1)], + [np.int64(256)], + ) + chk_i_6885 = np.int32(0) + one_7033 = np.int32(1) + for counter_7032 in 
range(hist_S_6879): + hist_H_chk_6886 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6879)) + if ( + 1 + * ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ) + ) != 0: + self.byte_histogramziseghist_global_6312_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + ct.c_int64(n_5765), + ct.c_int64(num_tblocks_6307), + ct.c_int64(num_subhistos_6815), + ct.c_int32(chk_i_6885), + ct.c_int64(hist_H_chk_6886), + xs_mem_6757, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.byte_histogramziseghist_global_6312_var, + ( + ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ), + ), + (self.sizes["byte_histogram.seghist_tblock_size_6304"],), + ) + if synchronous: + sync(self) + chk_i_6885 += one_7033 + if num_subhistos_6815 == np.int64(1): + mem_6758 = defunc_0_map_res_subhistos_mem_6816 + else: + chunk_sizze_6902 = np.int64(1) + if slt64( + (num_subhistos_6815 * np.int64(2)), + (seghist_tblock_sizze_6305 * chunk_sizze_6902), + ): + segment_sizze_nonzzero_6903 = smax64(np.int64(1), num_subhistos_6815) + num_threads_6904 = seghist_tblock_sizze_6305 * seghist_tblock_sizze_6305 + if ( + 1 + * ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ) + ) != 0: + self.byte_histogramzisegred_small_6901_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(8) * seghist_tblock_sizze_6305) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * seghist_tblock_sizze_6305), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(num_tblocks_6307), + ct.c_int64(num_subhistos_6815), + ct.c_int64(segment_sizze_nonzzero_6903), + mem_6758, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.byte_histogramzisegred_small_6901_var, + ( + ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ), + ), + (self.sizes["byte_histogram.seghist_tblock_size_6304"],), + ) + if synchronous: + sync(self) + else: + blocks_per_segment_6932 = sdiv_up64( + num_tblocks_6307, smax64(np.int64(1), np.int64(256)) + ) + q_6933 = sdiv_up64( + num_subhistos_6815, + ((seghist_tblock_sizze_6305 * blocks_per_segment_6932) * chunk_sizze_6902), + ) + num_virtblocks_6934 = blocks_per_segment_6932 * np.int64(256) + threads_per_segment_6935 = blocks_per_segment_6932 * seghist_tblock_sizze_6305 + segred_tmp_mem_6936 = opencl_alloc( + self, (np.int64(8) * num_virtblocks_6934), "segred_tmp_mem_6936" + ) + if ( + 1 + * ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ) + ) != 0: + self.byte_histogramzisegred_large_6901_var.set_args( + cl.LocalMemory( + max( + ( + np.int32(8) + + ( + (np.int64(8) * seghist_tblock_sizze_6305) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * seghist_tblock_sizze_6305), + np.int64(8), + ) + ), + np.int64(8), + ) + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(num_tblocks_6307), + ct.c_int64(num_subhistos_6815), + ct.c_int64(blocks_per_segment_6932), + ct.c_int64(q_6933), + ct.c_int64(num_virtblocks_6934), + ct.c_int64(threads_per_segment_6935), + mem_6758, + defunc_0_map_res_subhistos_mem_6816, + segred_tmp_mem_6936, + self.constants["counters_mem_6938"], + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.byte_histogramzisegred_large_6901_var, + ( + ( + np.int64(num_tblocks_6307) + * self.sizes["byte_histogram.seghist_tblock_size_6304"] + ), + ), + 
(self.sizes["byte_histogram.seghist_tblock_size_6304"],), + ) + if synchronous: + sync(self) + mem_out_6793 = mem_6758 + return mem_out_6793 + + def futhark_entry_chunked_entropy(self, xs_mem_6757, n_6046, chunk_sizze_6047): + zzero_6194 = chunk_sizze_6047 == np.int64(0) + nonzzero_6195 = not (zzero_6194) + nonzzero_cert_6196 = True + assert ( + nonzzero_6195 + ), "Error: {}\n\nBacktrace:\n-> #0 ofrak_gpu/entropy.fut:12:9-23\n #1 ofrak_gpu/entropy.fut:11:1-13:93\n".format( + "division by zero" + ) + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 = sdiv64(n_6046, chunk_sizze_6047) + bounds_invalid_upwards_6198 = slt64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, np.int64(0) + ) + valid_6199 = not (bounds_invalid_upwards_6198) + range_valid_c_6200 = True + assert valid_6199, ( + "Error: %s%d%s%d%s\n\nBacktrace:\n-> #0 ofrak_gpu/entropy.fut:12:4-24\n #1 ofrak_gpu/entropy.fut:11:1-13:93\n" + % ( + "Range ", + np.int64(0), + "..<", + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, + " is invalid.", + ) + ) + suff_outer_par_6346 = ( + self.sizes["chunked_entropy.suff_outer_par_0"] + <= dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + ) + segmap_tblock_sizze_6600 = self.sizes["chunked_entropy.segmap_tblock_size_6577"] + segmap_tblock_sizze_6639 = self.sizes["chunked_entropy.segmap_tblock_size_6523"] + max_num_tblocks_6795 = self.sizes["chunked_entropy.segmap_num_tblocks_6525"] + num_tblocks_6640 = sext_i64_i32( + smax64( + np.int64(1), + smin64( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6639 + ), + max_num_tblocks_6795, + ), + ) + ) + nest_sizze_6662 = np.int64(256) * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + segred_tblock_sizze_6663 = self.sizes["chunked_entropy.segred_tblock_size_6473"] + max_num_tblocks_6796 = self.sizes["chunked_entropy.segred_num_tblocks_6475"] + num_tblocks_6664 = sext_i64_i32( + smax64( + np.int64(1), + smin64(sdiv_up64(nest_sizze_6662, segred_tblock_sizze_6663), max_num_tblocks_6796), + ) + ) + segmap_tblock_sizze_6683 = self.sizes["chunked_entropy.segmap_tblock_size_6456"] + segmap_tblock_sizze_6401 = self.sizes["chunked_entropy.segmap_tblock_size_6350"] + binop_y_6768 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 - np.int64(1) + binop_x_6770 = smax64(np.int64(0), binop_y_6768) + binop_y_6772 = np.int64(255) * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + binop_y_6773 = smax64(np.int64(0), binop_y_6772) + binop_y_6774 = binop_x_6770 + binop_y_6773 + binop_y_6775 = np.int64(1) + binop_y_6774 + bytes_6776 = np.int64(8) * binop_y_6775 + bytes_6779 = np.int64(4) * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + shared_memory_capacity_6934 = self.max_shared_memory + if suff_outer_par_6346 and sle64(np.int64(0), shared_memory_capacity_6934): + segmap_usable_groups_6402 = sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6401 + ) + mem_6791 = opencl_alloc( + self, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, "mem_6791" + ) + virt_num_tblocks_6797 = sext_i64_i32( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6401 + ) + ) + if ( + 1 + * ( + np.int64(segmap_usable_groups_6402) + * self.sizes["chunked_entropy.segmap_tblock_size_6350"] + ) + ) != 0: + self.chunked_entropyzisegmap_6405_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + self.failure_is_an_option, + self.global_failure_args, + ct.c_int64(n_6046), + ct.c_int64(chunk_sizze_6047), + 
ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + xs_mem_6757, + mem_6791, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegmap_6405_var, + ( + ( + np.int64(segmap_usable_groups_6402) + * self.sizes["chunked_entropy.segmap_tblock_size_6350"] + ), + ), + (self.sizes["chunked_entropy.segmap_tblock_size_6350"],), + ) + if synchronous: + sync(self) + self.failure_is_an_option = np.int32(1) + ext_mem_6792 = mem_6791 + else: + segmap_usable_groups_6601 = sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6600 + ) + mem_6759 = opencl_alloc(self, np.int64(0), "mem_6759") + virt_num_tblocks_6809 = sext_i64_i32( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6600 + ) + ) + if ( + 1 + * ( + np.int64(segmap_usable_groups_6601) + * self.sizes["chunked_entropy.segmap_tblock_size_6577"] + ) + ) != 0: + self.chunked_entropyzisegmap_6606_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + self.failure_is_an_option, + self.global_failure_args, + ct.c_int64(n_6046), + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + mem_6759, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegmap_6606_var, + ( + ( + np.int64(segmap_usable_groups_6601) + * self.sizes["chunked_entropy.segmap_tblock_size_6577"] + ), + ), + (self.sizes["chunked_entropy.segmap_tblock_size_6577"],), + ) + if synchronous: + sync(self) + self.failure_is_an_option = np.int32(1) + mem_6777 = opencl_alloc(self, bytes_6776, "mem_6777") + virt_num_tblocks_6818 = sext_i64_i32( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6639 + ) + ) + if ( + 1 + * ( + np.int64(num_tblocks_6640) + * self.sizes["chunked_entropy.segmap_tblock_size_6523"] + ) + ) != 0: + self.chunked_entropyzisegmap_6645_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + ct.c_int64(num_tblocks_6640), + ct.c_int32(virt_num_tblocks_6818), + xs_mem_6757, + mem_6759, + mem_6777, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegmap_6645_var, + ( + ( + np.int64(num_tblocks_6640) + * self.sizes["chunked_entropy.segmap_tblock_size_6523"] + ), + ), + (self.sizes["chunked_entropy.segmap_tblock_size_6523"],), + ) + if synchronous: + sync(self) + mem_6759 = None + mem_6780 = opencl_alloc(self, bytes_6779, "mem_6780") + chunk_sizze_6833 = np.int64(1) + if slt64(np.int64(512), (segred_tblock_sizze_6663 * chunk_sizze_6833)): + segment_sizze_nonzzero_6834 = smax64(np.int64(1), np.int64(256)) + num_threads_6835 = segred_tblock_sizze_6663 * segred_tblock_sizze_6663 + if ( + 1 + * ( + np.int64(num_tblocks_6664) + * self.sizes["chunked_entropy.segred_tblock_size_6473"] + ) + ) != 0: + self.chunked_entropyzisegred_small_6669_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(4) * segred_tblock_sizze_6663) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(4) * segred_tblock_sizze_6663), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + ct.c_int64(num_tblocks_6664), + ct.c_int64(segment_sizze_nonzzero_6834), + mem_6777, + mem_6780, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegred_small_6669_var, + ( + ( + np.int64(num_tblocks_6664) + * 
self.sizes["chunked_entropy.segred_tblock_size_6473"] + ), + ), + (self.sizes["chunked_entropy.segred_tblock_size_6473"],), + ) + if synchronous: + sync(self) + else: + blocks_per_segment_6862 = sdiv_up64( + num_tblocks_6664, + smax64(np.int64(1), dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + ) + q_6863 = sdiv_up64( + np.int64(256), + ((segred_tblock_sizze_6663 * blocks_per_segment_6862) * chunk_sizze_6833), + ) + num_virtblocks_6864 = ( + blocks_per_segment_6862 * dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + ) + threads_per_segment_6865 = blocks_per_segment_6862 * segred_tblock_sizze_6663 + segred_tmp_mem_6866 = opencl_alloc( + self, (np.int64(4) * num_virtblocks_6864), "segred_tmp_mem_6866" + ) + if ( + 1 + * ( + np.int64(num_tblocks_6664) + * self.sizes["chunked_entropy.segred_tblock_size_6473"] + ) + ) != 0: + self.chunked_entropyzisegred_large_6669_var.set_args( + cl.LocalMemory( + max( + ( + np.int32(8) + + ( + (np.int64(4) * segred_tblock_sizze_6663) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(4) * segred_tblock_sizze_6663), + np.int64(8), + ) + ), + np.int64(8), + ) + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + ct.c_int64(num_tblocks_6664), + ct.c_int64(blocks_per_segment_6862), + ct.c_int64(q_6863), + ct.c_int64(num_virtblocks_6864), + ct.c_int64(threads_per_segment_6865), + mem_6777, + mem_6780, + segred_tmp_mem_6866, + self.constants["counters_mem_6868"], + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegred_large_6669_var, + ( + ( + np.int64(num_tblocks_6664) + * self.sizes["chunked_entropy.segred_tblock_size_6473"] + ), + ), + (self.sizes["chunked_entropy.segred_tblock_size_6473"],), + ) + if synchronous: + sync(self) + mem_6777 = None + segmap_usable_groups_6684 = sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6683 + ) + mem_6782 = opencl_alloc( + self, dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, "mem_6782" + ) + virt_num_tblocks_6925 = sext_i64_i32( + sdiv_up64( + dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197, segmap_tblock_sizze_6683 + ) + ) + if ( + 1 + * ( + np.int64(segmap_usable_groups_6684) + * self.sizes["chunked_entropy.segmap_tblock_size_6456"] + ) + ) != 0: + self.chunked_entropyzisegmap_6687_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + ct.c_int64(chunk_sizze_6047), + ct.c_int64(dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197), + mem_6780, + mem_6782, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.chunked_entropyzisegmap_6687_var, + ( + ( + np.int64(segmap_usable_groups_6684) + * self.sizes["chunked_entropy.segmap_tblock_size_6456"] + ), + ), + (self.sizes["chunked_entropy.segmap_tblock_size_6456"],), + ) + if synchronous: + sync(self) + mem_6780 = None + ext_mem_6792 = mem_6782 + mem_out_6793 = ext_mem_6792 + prim_out_6794 = dzlz7bUZLZLzsZRz20Unz20Uchunk_sizzeZRz7dUzg_6197 + return (mem_out_6793, prim_out_6794) + + def futhark_entry_entropy(self, xs_mem_6757, n_5907): + mem_6758 = opencl_alloc(self, np.int64(2048), "mem_6758") + self.futhark_builtinzhreplicate_i64(mem_6758, np.int64(256), np.int64(0)) + seghist_tblock_sizze_6321 = self.sizes["entropy.seghist_tblock_size_6320"] + max_num_tblocks_6814 = self.sizes["entropy.seghist_num_tblocks_6322"] + num_tblocks_6323 = sext_i64_i32( + smax64( + np.int64(1), + smin64(sdiv_up64(n_5907, seghist_tblock_sizze_6321), max_num_tblocks_6814), + ) + ) + h_6818 = np.int64(2048) + seg_h_6819 = 
np.int64(2048) + if seg_h_6819 == np.int64(0): + pass + else: + hist_H_6820 = np.int64(256) + hist_el_sizze_6821 = sdiv_up64(h_6818, hist_H_6820) + hist_N_6822 = n_5907 + hist_RF_6823 = np.int32(1) + hist_L_6824 = self.max_shared_memory + max_tblock_sizze_6825 = self.max_thread_block_size + num_tblocks_6826 = sdiv_up64( + sext_i32_i64(sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321)), + max_tblock_sizze_6825, + ) + hist_m_prime_6827 = sitofp_i64_f64( + smin64( + squot64(hist_L_6824, hist_el_sizze_6821), + sdiv_up64(hist_N_6822, num_tblocks_6826), + ) + ) / sitofp_i64_f64(hist_H_6820) + hist_M0_6828 = smax64( + np.int64(1), smin64(fptosi_f64_i64(hist_m_prime_6827), max_tblock_sizze_6825) + ) + hist_Nout_6829 = np.int64(1) + hist_Nin_6830 = n_5907 + work_asymp_M_max_6831 = squot64( + (hist_Nout_6829 * hist_N_6822), ((np.int64(2) * num_tblocks_6826) * hist_H_6820) + ) + hist_M_6832 = sext_i64_i32(smin64(hist_M0_6828, work_asymp_M_max_6831)) + hist_C_6833 = sdiv_up64( + max_tblock_sizze_6825, sext_i32_i64(smax32(np.int32(1), hist_M_6832)) + ) + local_mem_needed_6834 = hist_el_sizze_6821 * sext_i32_i64(hist_M_6832) + hist_S_6835 = sext_i64_i32( + sdiv_up64(((hist_H_6820 * local_mem_needed_6834) + np.int64(1)), hist_L_6824) + ) + if sle64(hist_H_6820, hist_Nin_6830) and ( + sle64(local_mem_needed_6834, hist_L_6824) + and ( + sle32(hist_S_6835, np.int32(3)) + and ( + sle64(hist_C_6833, max_tblock_sizze_6825) + and slt32(np.int32(0), hist_M_6832) + ) + ) + ): + num_subhistos_6815 = num_tblocks_6826 + if num_subhistos_6815 == np.int64(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + defunc_0_map_res_subhistos_mem_6816 = opencl_alloc( + self, + ((num_subhistos_6815 * np.int64(256)) * np.int64(8)), + "defunc_0_map_res_subhistos_mem_6816", + ) + self.futhark_builtinzhreplicate_i64( + defunc_0_map_res_subhistos_mem_6816, + (num_subhistos_6815 * np.int64(256)), + np.int64(0), + ) + lmad_copy_gpu2gpu( + self, + ct.c_int64, + defunc_0_map_res_subhistos_mem_6816, + np.int64(0), + [np.int64(1)], + mem_6758, + np.int64(0), + [np.int64(1)], + [np.int64(256)], + ) + chk_i_6836 = np.int32(0) + one_7035 = np.int32(1) + for counter_7034 in range(hist_S_6835): + num_segments_6837 = np.int64(1) + hist_H_chk_6838 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6835)) + histo_sizze_6839 = hist_H_chk_6838 + init_per_thread_6840 = sext_i64_i32( + sdiv_up64( + (sext_i32_i64(hist_M_6832) * histo_sizze_6839), max_tblock_sizze_6825 + ) + ) + if (1 * (np.int64(num_tblocks_6826) * self.max_thread_block_size)) != 0: + self.entropyziseghist_local_6328_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(8) * (hist_M_6832 * hist_H_chk_6838)) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * (hist_M_6832 * hist_H_chk_6838)), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(n_5907), + ct.c_int64(num_subhistos_6815), + ct.c_int64(num_tblocks_6826), + ct.c_int32(hist_M_6832), + ct.c_int32(chk_i_6836), + ct.c_int64(num_segments_6837), + ct.c_int64(hist_H_chk_6838), + ct.c_int64(histo_sizze_6839), + ct.c_int32(init_per_thread_6840), + xs_mem_6757, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyziseghist_local_6328_var, + ((np.int64(num_tblocks_6826) * self.max_thread_block_size),), + (self.max_thread_block_size,), + ) + if synchronous: + sync(self) + chk_i_6836 += one_7035 + else: + hist_H_6872 = np.int64(256) + hist_RF_6873 = (np.float64(0.0) + sitofp_i32_f64(np.int64(1))) / np.float64(1.0) + 
hist_el_sizze_6874 = np.int32(8) + hist_C_max_6875 = fmin64( + sitofp_i32_f64(sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321)), + (sitofp_i32_f64(hist_H_6872) / np.float64(2.0)), + ) + hist_M_min_6876 = smax32( + np.int32(1), + sext_i64_i32( + fptosi_f64_i64( + sitofp_i32_f64( + sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321) + ) + / hist_C_max_6875 + ) + ), + ) + L2_sizze_6877 = self.max_cache + hist_RACE_exp_6878 = fmax64( + np.float64(1.0), + ( + (np.float64(0.75) * hist_RF_6873) + / (np.float64(64.0) / sitofp_i32_f64(hist_el_sizze_6874)) + ), + ) + if slt64(n_5907, hist_H_6872): + hist_S_6879 = np.int32(1) + else: + hist_S_6879 = sext_i64_i32( + sdiv_up64( + ( + (sext_i32_i64(hist_M_min_6876) * hist_H_6872) + * sext_i32_i64(hist_el_sizze_6874) + ), + fptosi_f64_i64( + (np.float64(0.4) * sitofp_i32_f64(L2_sizze_6877)) + * hist_RACE_exp_6878 + ), + ) + ) + hist_H_chk_6880 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6879)) + hist_k_max_6881 = fmin64( + ( + ( + np.float64(0.4) + * (sitofp_i32_f64(L2_sizze_6877) / sitofp_i32_f64(np.int32(8))) + ) + * hist_RACE_exp_6878 + ), + sitofp_i32_f64(n_5907), + ) / sitofp_i32_f64(sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321)) + hist_u_6882 = np.int64(2) + hist_C_6883 = fmin64( + sitofp_i32_f64(sext_i64_i32(num_tblocks_6323 * seghist_tblock_sizze_6321)), + (sitofp_i32_f64(hist_u_6882 * hist_H_chk_6880) / hist_k_max_6881), + ) + hist_M_6884 = np.int32(1) + num_subhistos_6815 = sext_i32_i64(hist_M_6884) + if hist_M_6884 == np.int32(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + if num_subhistos_6815 == np.int64(1): + defunc_0_map_res_subhistos_mem_6816 = mem_6758 + else: + defunc_0_map_res_subhistos_mem_6816 = opencl_alloc( + self, + ((num_subhistos_6815 * np.int64(256)) * np.int64(8)), + "defunc_0_map_res_subhistos_mem_6816", + ) + self.futhark_builtinzhreplicate_i64( + defunc_0_map_res_subhistos_mem_6816, + (num_subhistos_6815 * np.int64(256)), + np.int64(0), + ) + lmad_copy_gpu2gpu( + self, + ct.c_int64, + defunc_0_map_res_subhistos_mem_6816, + np.int64(0), + [np.int64(1)], + mem_6758, + np.int64(0), + [np.int64(1)], + [np.int64(256)], + ) + chk_i_6885 = np.int32(0) + one_7037 = np.int32(1) + for counter_7036 in range(hist_S_6879): + hist_H_chk_6886 = sdiv_up64(np.int64(256), sext_i32_i64(hist_S_6879)) + if ( + 1 + * ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ) + ) != 0: + self.entropyziseghist_global_6328_var.set_args( + cl.LocalMemory(max(np.int64(0), 1)), + self.global_failure, + ct.c_int64(n_5907), + ct.c_int64(num_tblocks_6323), + ct.c_int64(num_subhistos_6815), + ct.c_int32(chk_i_6885), + ct.c_int64(hist_H_chk_6886), + xs_mem_6757, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyziseghist_global_6328_var, + ( + ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ), + ), + (self.sizes["entropy.seghist_tblock_size_6320"],), + ) + if synchronous: + sync(self) + chk_i_6885 += one_7037 + if num_subhistos_6815 == np.int64(1): + mem_6758 = defunc_0_map_res_subhistos_mem_6816 + else: + chunk_sizze_6902 = np.int64(1) + if slt64( + (num_subhistos_6815 * np.int64(2)), + (seghist_tblock_sizze_6321 * chunk_sizze_6902), + ): + segment_sizze_nonzzero_6903 = smax64(np.int64(1), num_subhistos_6815) + num_threads_6904 = seghist_tblock_sizze_6321 * seghist_tblock_sizze_6321 + if ( + 1 + * ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ) + ) != 0: + 
self.entropyzisegred_small_6901_var.set_args( + cl.LocalMemory( + max( + ( + (np.int64(8) * seghist_tblock_sizze_6321) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * seghist_tblock_sizze_6321), + np.int64(8), + ) + ), + np.int64(8), + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(num_tblocks_6323), + ct.c_int64(num_subhistos_6815), + ct.c_int64(segment_sizze_nonzzero_6903), + mem_6758, + defunc_0_map_res_subhistos_mem_6816, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyzisegred_small_6901_var, + ( + ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ), + ), + (self.sizes["entropy.seghist_tblock_size_6320"],), + ) + if synchronous: + sync(self) + else: + blocks_per_segment_6932 = sdiv_up64( + num_tblocks_6323, smax64(np.int64(1), np.int64(256)) + ) + q_6933 = sdiv_up64( + num_subhistos_6815, + ((seghist_tblock_sizze_6321 * blocks_per_segment_6932) * chunk_sizze_6902), + ) + num_virtblocks_6934 = blocks_per_segment_6932 * np.int64(256) + threads_per_segment_6935 = blocks_per_segment_6932 * seghist_tblock_sizze_6321 + segred_tmp_mem_6936 = opencl_alloc( + self, (np.int64(8) * num_virtblocks_6934), "segred_tmp_mem_6936" + ) + if ( + 1 + * ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ) + ) != 0: + self.entropyzisegred_large_6901_var.set_args( + cl.LocalMemory( + max( + ( + np.int32(8) + + ( + (np.int64(8) * seghist_tblock_sizze_6321) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(8) * seghist_tblock_sizze_6321), + np.int64(8), + ) + ), + np.int64(8), + ) + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_int64(num_tblocks_6323), + ct.c_int64(num_subhistos_6815), + ct.c_int64(blocks_per_segment_6932), + ct.c_int64(q_6933), + ct.c_int64(num_virtblocks_6934), + ct.c_int64(threads_per_segment_6935), + mem_6758, + defunc_0_map_res_subhistos_mem_6816, + segred_tmp_mem_6936, + self.constants["counters_mem_6938"], + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyzisegred_large_6901_var, + ( + ( + np.int64(num_tblocks_6323) + * self.sizes["entropy.seghist_tblock_size_6320"] + ), + ), + (self.sizes["entropy.seghist_tblock_size_6320"],), + ) + if synchronous: + sync(self) + i64_res_6254 = sitofp_i64_f32(n_5907) + segred_tblock_sizze_6337 = self.sizes["entropy.segred_tblock_size_6336"] + max_num_tblocks_6995 = self.sizes["entropy.segred_num_tblocks_6338"] + num_tblocks_6339 = sext_i64_i32( + smax64( + np.int64(1), + smin64(sdiv_up64(np.int64(256), segred_tblock_sizze_6337), max_num_tblocks_6995), + ) + ) + mem_6761 = opencl_alloc(self, np.int64(4), "mem_6761") + chunk_sizze_6996 = np.int64(1) + segred_tmp_mem_6999 = opencl_alloc( + self, (np.int64(4) * num_tblocks_6339), "segred_tmp_mem_6999" + ) + num_threads_7001 = num_tblocks_6339 * segred_tblock_sizze_6337 + if (1 * (np.int64(num_tblocks_6339) * self.sizes["entropy.segred_tblock_size_6336"])) != 0: + self.entropyzisegred_nonseg_6344_var.set_args( + cl.LocalMemory( + max( + ( + np.int32(8) + + ( + (np.int64(4) * segred_tblock_sizze_6337) + + srem64( + ( + np.int64(8) + - srem64( + (np.int64(4) * segred_tblock_sizze_6337), np.int64(8) + ) + ), + np.int64(8), + ) + ) + ), + 1, + ) + ), + self.global_failure, + ct.c_float(i64_res_6254), + ct.c_int64(num_tblocks_6339), + ct.c_int64(num_threads_7001), + mem_6758, + mem_6761, + self.constants["counters_mem_6997"], + segred_tmp_mem_6999, + ) + cl.enqueue_nd_range_kernel( + self.queue, + self.entropyzisegred_nonseg_6344_var, + ((np.int64(num_tblocks_6339) * 
self.sizes["entropy.segred_tblock_size_6336"]),), + (self.sizes["entropy.segred_tblock_size_6336"],), + ) + if synchronous: + sync(self) + mem_6758 = None + read_res_7038 = np.empty(1, dtype=ct.c_float) + cl.enqueue_copy( + self.queue, + read_res_7038, + mem_6761, + src_offset=(np.int64(np.int64(0)) * 4), + is_blocking=synchronous, + ) + sync(self) + defunc_0_f_res_6297 = read_res_7038[0] + mem_6761 = None + zs_lhs_6270 = np.float32(-1.0) * defunc_0_f_res_6297 + log2_res_6272 = futhark_log2_32(i64_res_6254) + lifted_lambda_res_6273 = zs_lhs_6270 / log2_res_6272 + prim_out_6793 = lifted_lambda_res_6273 + return prim_out_6793 + + def byte_histogram(self, xs_mem_6757_ext): + n_5765 = None + try: + assert (type(xs_mem_6757_ext) in [np.ndarray, cl.array.Array]) and ( + xs_mem_6757_ext.dtype == np.uint8 + ), "Parameter has unexpected type" + if n_5765 == None: + n_5765 = np.int64(xs_mem_6757_ext.shape[0]) + else: + assert ( + n_5765 == xs_mem_6757_ext.shape[0] + ), "Error: entry point arguments have invalid sizes." + if type(xs_mem_6757_ext) == cl.array.Array: + xs_mem_6757 = xs_mem_6757_ext.data + else: + xs_mem_6757 = opencl_alloc(self, np.int64(xs_mem_6757_ext.nbytes), "xs_mem_6757") + if np.int64(xs_mem_6757_ext.nbytes) != 0: + cl.enqueue_copy( + self.queue, + xs_mem_6757, + normaliseArray(xs_mem_6757_ext), + is_blocking=synchronous, + ) + except (TypeError, AssertionError) as e: + raise TypeError( + "Argument #0 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format( + "[]u8", type(xs_mem_6757_ext), xs_mem_6757_ext + ) + ) + time_start = time.time() + with np.errstate(divide="ignore", over="ignore", under="ignore", invalid="ignore"): + mem_out_6793 = self.futhark_entry_byte_histogram(xs_mem_6757, n_5765) + runtime = int(time.time() * 1000000) - int(time_start * 1000000) + sync(self) + return cl.array.Array(self.queue, (np.int64(256),), np.int64, data=mem_out_6793) + + def chunked_entropy(self, chunk_sizze_6047_ext, xs_mem_6757_ext): + n_6046 = None + try: + chunk_sizze_6047 = np.int64(ct.c_int64(chunk_sizze_6047_ext)) + except (TypeError, AssertionError) as e: + raise TypeError( + "Argument #0 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format( + "i64", type(chunk_sizze_6047_ext), chunk_sizze_6047_ext + ) + ) + try: + assert (type(xs_mem_6757_ext) in [np.ndarray, cl.array.Array]) and ( + xs_mem_6757_ext.dtype == np.uint8 + ), "Parameter has unexpected type" + if n_6046 == None: + n_6046 = np.int64(xs_mem_6757_ext.shape[0]) + else: + assert ( + n_6046 == xs_mem_6757_ext.shape[0] + ), "Error: entry point arguments have invalid sizes." 
+ if type(xs_mem_6757_ext) == cl.array.Array: + xs_mem_6757 = xs_mem_6757_ext.data + else: + xs_mem_6757 = opencl_alloc(self, np.int64(xs_mem_6757_ext.nbytes), "xs_mem_6757") + if np.int64(xs_mem_6757_ext.nbytes) != 0: + cl.enqueue_copy( + self.queue, + xs_mem_6757, + normaliseArray(xs_mem_6757_ext), + is_blocking=synchronous, + ) + except (TypeError, AssertionError) as e: + raise TypeError( + "Argument #1 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format( + "[]u8", type(xs_mem_6757_ext), xs_mem_6757_ext + ) + ) + time_start = time.time() + with np.errstate(divide="ignore", over="ignore", under="ignore", invalid="ignore"): + (mem_out_6793, prim_out_6794) = self.futhark_entry_chunked_entropy( + xs_mem_6757, n_6046, chunk_sizze_6047 + ) + runtime = int(time.time() * 1000000) - int(time_start * 1000000) + sync(self) + return cl.array.Array(self.queue, (prim_out_6794,), np.uint8, data=mem_out_6793) + + def entropy(self, xs_mem_6757_ext): + n_5907 = None + try: + assert (type(xs_mem_6757_ext) in [np.ndarray, cl.array.Array]) and ( + xs_mem_6757_ext.dtype == np.uint8 + ), "Parameter has unexpected type" + if n_5907 == None: + n_5907 = np.int64(xs_mem_6757_ext.shape[0]) + else: + assert ( + n_5907 == xs_mem_6757_ext.shape[0] + ), "Error: entry point arguments have invalid sizes." + if type(xs_mem_6757_ext) == cl.array.Array: + xs_mem_6757 = xs_mem_6757_ext.data + else: + xs_mem_6757 = opencl_alloc(self, np.int64(xs_mem_6757_ext.nbytes), "xs_mem_6757") + if np.int64(xs_mem_6757_ext.nbytes) != 0: + cl.enqueue_copy( + self.queue, + xs_mem_6757, + normaliseArray(xs_mem_6757_ext), + is_blocking=synchronous, + ) + except (TypeError, AssertionError) as e: + raise TypeError( + "Argument #0 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format( + "[]u8", type(xs_mem_6757_ext), xs_mem_6757_ext + ) + ) + time_start = time.time() + with np.errstate(divide="ignore", over="ignore", under="ignore", invalid="ignore"): + prim_out_6793 = self.futhark_entry_entropy(xs_mem_6757, n_5907) + runtime = int(time.time() * 1000000) - int(time_start * 1000000) + sync(self) + return np.float32(prim_out_6793) diff --git a/ofrak_gpu/ofrak_gpu/run.py b/ofrak_gpu/ofrak_gpu/run_entropy.py similarity index 100% rename from ofrak_gpu/ofrak_gpu/run.py rename to ofrak_gpu/ofrak_gpu/run_entropy.py
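The generated `entropy.py` above exposes three public entry points, visible in the wrappers at the end of the diff: `byte_histogram` (a 256-bin `int64` histogram of byte values), `chunked_entropy` (an array of `uint8` entropy values, one per chunk, with the chunk size passed as the first argument), and `entropy` (a single `np.float32` for the whole buffer). A minimal usage sketch follows; it assumes the Futhark pyopencl backend's convention of naming the generated class after the program (`entropy`), assumes default construction picks a usable OpenCL device, and uses a hypothetical input path, so treat it as illustrative rather than as part of the patch.

```python
# Minimal usage sketch, not part of the diff. Assumptions: the generated class
# is named `entropy` (Futhark pyopencl convention) and default construction
# selects a usable OpenCL platform/device; "firmware.bin" is a placeholder path.
import numpy as np
from ofrak_gpu.entropy import entropy

fut = entropy()
data = np.fromfile("firmware.bin", dtype=np.uint8)  # entry points require uint8 input

total = fut.entropy(data)                      # np.float32 scalar for the whole buffer
hist = fut.byte_histogram(data).get()          # shape-(256,) int64 histogram on the host
chunks = fut.chunked_entropy(256, data).get()  # one uint8 entropy value per 256-byte chunk
```

Note that the wrappers accept either `np.ndarray` or `pyopencl.array.Array` inputs and return `pyopencl` arrays for the array-valued results, so intermediate data can stay on-device between calls; `.get()` is only needed to copy results back to the host.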