Skip to content

Commit

Permalink
Fix performance for CUDA >= 9.2 (master) (#1327)
Browse files Browse the repository at this point in the history
- sets `GT_CONSTEXPR` to `constexpr` for nvcc 
- introduces `const_ref` to fix performance for CUDA >= 9.2 (`T` for small data types, `T const&` for large data types)
  • Loading branch information
lukasm91 authored and anstaf committed Jul 8, 2019
1 parent 0de85fa commit 9bb2d64
Show file tree
Hide file tree
Showing 52 changed files with 3,433 additions and 3,405 deletions.
9 changes: 5 additions & 4 deletions include/gridtools/common/array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "../meta/macros.hpp"
#include "../meta/repeat.hpp"
#include "defs.hpp"
#include "generic_metafunctions/const_ref.hpp"
#include "generic_metafunctions/utility.hpp"
#include "gt_assert.hpp"
#include "host_device.hpp"
Expand Down Expand Up @@ -117,13 +118,13 @@ namespace gridtools {
}

template <size_t I, typename T, size_t D>
static GT_FUNCTION GT_CONSTEXPR const T &get(const array<T, D> &arr) noexcept {
static GT_FUNCTION GT_CONSTEXPR const_ref<T> get(const array<T, D> &arr) noexcept {
GT_STATIC_ASSERT(I < D, "index is out of bounds");
return arr.m_array[I];
}

template <size_t I, typename T, size_t D>
static GT_FUNCTION GT_CONSTEXPR T &&get(array<T, D> &&arr) noexcept {
static GT_FUNCTION GT_CONSTEXPR T get(array<T, D> &&arr) noexcept {
GT_STATIC_ASSERT(I < D, "index is out of bounds");
return wstd::move(arr.m_array[I]);
}
Expand Down Expand Up @@ -187,13 +188,13 @@ namespace gridtools {
}

template <size_t I, typename T, size_t D>
GT_FUNCTION GT_CONSTEXPR const T &get(const array<T, D> &arr) noexcept {
GT_FUNCTION GT_CONSTEXPR const_ref<T> get(const array<T, D> &arr) noexcept {
GT_STATIC_ASSERT(I < D, "index is out of bounds");
return arr.m_array[I];
}

template <size_t I, typename T, size_t D>
GT_FUNCTION GT_CONSTEXPR T &&get(array<T, D> &&arr) noexcept {
GT_FUNCTION GT_CONSTEXPR T get(array<T, D> &&arr) noexcept {
GT_STATIC_ASSERT(I < D, "index is out of bounds");
return wstd::move(get<I>(arr));
}
Expand Down
4 changes: 0 additions & 4 deletions include/gridtools/common/defs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,7 @@
@brief global definitions
*/

#ifdef __CUDA_ARCH__
#define GT_CONSTEXPR
#else
#define GT_CONSTEXPR constexpr
#endif

#define GT_RESTRICT __restrict__

Expand Down
31 changes: 31 additions & 0 deletions include/gridtools/common/generic_metafunctions/const_ref.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* GridTools
*
* Copyright (c) 2014-2019, ETH Zurich
* All rights reserved.
*
* Please, refer to the LICENSE file in the root directory.
* SPDX-License-Identifier: BSD-3-Clause
*/

#pragma once

#include <type_traits>

#include "../../meta/macros.hpp"
#include "../../meta/type_traits.hpp"

namespace gridtools {
    namespace lazy {
        /**
         *  Lazy form of `const_ref`: the result is exposed as the nested `type`.
         *
         *  General case: `T` is passed as an lvalue reference to `const T`.
         */
        template <class T, class Enable = void>
        struct const_ref {
            using type = std::add_lvalue_reference_t<std::add_const_t<T>>;
        };

        /**
         *  Cheap-to-copy case: `T` is not a reference, is trivially copy constructible and is no larger than a
         *  pointer to `T`. Such types are passed as `const T` (by value) instead of by reference.
         */
        template <class T>
        struct const_ref<T,
            std::enable_if_t<!std::is_reference<T>::value && std::is_trivially_copy_constructible<T>::value &&
                             sizeof(T) <= sizeof(std::add_pointer_t<T>)>> {
            using type = std::add_const_t<T>;
        };
    } // namespace lazy

    /**
     *  `const T` for small trivially copy constructible non-reference types, `T const &` otherwise.
     */
    template <class T>
    using const_ref = typename lazy::const_ref<T>::type;
} // namespace gridtools
19 changes: 7 additions & 12 deletions include/gridtools/common/gt_assert.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* SPDX-License-Identifier: BSD-3-Clause
*/
#pragma once
#include <cassert>
#include <stdexcept>

/** \ingroup common
Expand All @@ -16,19 +17,13 @@
@{
*/

#ifdef __CUDACC__
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
// we take the cuda assert for arch greater than 2.x
#include <assert.h>
#else
#undef assert
#define assert(e)
#endif
#else
#include <cassert>
#endif

#ifdef __CUDA_ARCH__
#if __CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ == 2
// For CUDA 9.2 we define __PRETTY_FUNCTION__ as an empty string: in some cases CUDA 9.2 tries to compile device
// instantiations of constexpr function templates, which can lead to compile-time errors such as "cannot use an
// entity undefined in device code".
#define __PRETTY_FUNCTION__ ""
#endif
#define GT_ASSERT_OR_THROW(cond, msg) assert(cond)
#else
#define GT_ASSERT_OR_THROW(cond, msg) \
Expand Down
9 changes: 5 additions & 4 deletions include/gridtools/common/pair.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <utility>

#include "defs.hpp"
#include "generic_metafunctions/const_ref.hpp"
#include "generic_metafunctions/utility.hpp"
#include "host_device.hpp"

Expand Down Expand Up @@ -129,30 +130,30 @@ namespace gridtools {
template <>
struct pair_get<0> {
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION const T1 &const_get(const pair<T1, T2> &p) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T1> const_get(const pair<T1, T2> &p) noexcept {
return p.first;
}
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION T1 &get(pair<T1, T2> &p) noexcept {
return p.first;
}
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION T1 &&move_get(pair<T1, T2> &&p) noexcept {
static GT_CONSTEXPR GT_FUNCTION T1 move_get(pair<T1, T2> &&p) noexcept {
return wstd::move(p.first);
}
};
template <>
struct pair_get<1> {
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION const T2 &const_get(const pair<T1, T2> &p) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T2> const_get(const pair<T1, T2> &p) noexcept {
return p.second;
}
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION T2 &get(pair<T1, T2> &p) noexcept {
return p.second;
}
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION T2 &&move_get(pair<T1, T2> &&p) noexcept {
static GT_CONSTEXPR GT_FUNCTION T2 move_get(pair<T1, T2> &&p) noexcept {
return wstd::move(p.second);
}
};
Expand Down
17 changes: 9 additions & 8 deletions include/gridtools/common/tuple.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "../meta/type_traits.hpp"
#include "defs.hpp"
#include "generic_metafunctions/const_ref.hpp"
#include "generic_metafunctions/utility.hpp"
#include "host_device.hpp"

Expand Down Expand Up @@ -50,7 +51,7 @@ namespace gridtools {

struct tuple_leaf_getter {
template <size_t I, class T>
static GT_CONSTEXPR GT_FUNCTION T const &get(tuple_leaf<I, T, false> const &obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T> get(tuple_leaf<I, T, false> const &obj) noexcept {
return obj.m_value;
}

Expand All @@ -60,12 +61,12 @@ namespace gridtools {
}

template <size_t I, class T>
static GT_CONSTEXPR GT_FUNCTION T &&get(tuple_leaf<I, T, false> &&obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION T get(tuple_leaf<I, T, false> &&obj) noexcept {
return static_cast<T &&>(get<I>(obj));
}

template <size_t I, class T>
static GT_CONSTEXPR GT_FUNCTION T const &get(tuple_leaf<I, T, true> const &obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T> get(tuple_leaf<I, T, true> const &obj) noexcept {
return obj;
}

Expand All @@ -75,7 +76,7 @@ namespace gridtools {
}

template <size_t I, class T>
static GT_CONSTEXPR GT_FUNCTION T &&get(tuple_leaf<I, T, true> &&obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION T get(tuple_leaf<I, T, true> &&obj) noexcept {
return static_cast<T &&>(obj);
}
};
Expand Down Expand Up @@ -171,7 +172,7 @@ namespace gridtools {
tuple &operator=(tuple const &) = default;
tuple &operator=(tuple &&) = default;

GT_CONSTEXPR GT_FUNCTION tuple(Ts const &... args) noexcept : m_impl(args...) {}
GT_CONSTEXPR GT_FUNCTION tuple(const_ref<Ts>... args) noexcept : m_impl(args...) {}

template <class... Args,
std::enable_if_t<sizeof...(Ts) == sizeof...(Args) &&
Expand Down Expand Up @@ -205,7 +206,7 @@ namespace gridtools {
T m_value;
struct getter {
template <size_t I, std::enable_if_t<I == 0, int> = 0>
static GT_CONSTEXPR GT_FUNCTION T const &get(tuple const &obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T> get(tuple const &obj) noexcept {
return obj.m_value;
}

Expand All @@ -215,7 +216,7 @@ namespace gridtools {
}

template <size_t I, std::enable_if_t<I == 0, int> = 0>
static GT_CONSTEXPR GT_FUNCTION T &&get(tuple &&obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION T get(tuple &&obj) noexcept {
return static_cast<T &&>(obj.m_value);
}
};
Expand All @@ -232,7 +233,7 @@ namespace gridtools {
tuple &operator=(tuple const &) = default;
tuple &operator=(tuple &&) = default;

GT_CONSTEXPR GT_FUNCTION tuple(T const &arg) noexcept : m_value(arg) {}
GT_CONSTEXPR GT_FUNCTION tuple(const_ref<T> arg) noexcept : m_value(arg) {}

template <class Arg, std::enable_if_t<std::is_constructible<T, Arg &&>::value, int> = 0>
GT_CONSTEXPR GT_FUNCTION tuple(Arg &&arg) noexcept : m_value(wstd::forward<Arg>(arg)) {}
Expand Down
32 changes: 15 additions & 17 deletions include/gridtools/common/tuple_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,10 +253,7 @@ namespace gridtools {
enum class ref_kind { rvalue, lvalue, const_lvalue };

template <class>
struct get_ref_kind;

template <class T>
struct get_ref_kind<T &&> : std::integral_constant<ref_kind, ref_kind::rvalue> {};
struct get_ref_kind : std::integral_constant<ref_kind, ref_kind::rvalue> {};

template <class T>
struct get_ref_kind<T &> : std::integral_constant<ref_kind, ref_kind::lvalue> {};
Expand All @@ -269,7 +266,9 @@ namespace gridtools {
struct add_ref;

template <class T>
struct add_ref<ref_kind::rvalue, T> : std::add_rvalue_reference<T> {};
struct add_ref<ref_kind::rvalue, T> {
using type = T;
};

template <class T>
struct add_ref<ref_kind::lvalue, T> : std::add_lvalue_reference<T> {};
Expand Down Expand Up @@ -419,8 +418,7 @@ namespace gridtools {
template <class Tup,
class... Tups,
class Is = meta::make_indices<size<std::decay_t<Tup>>>,
class Res =
from_types<Tup, get_results_t<Is, get_accessors<Tup &&>, get_accessors<Tups &&>...>>>
class Res = from_types<Tup, get_results_t<Is, get_accessors<Tup>, get_accessors<Tups>...>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup, Tups &&... tups) const {
using generators_t = meta::transform<get_transform_index_generator, Is>;
return generate_f<generators_t, Res>{}(
Expand Down Expand Up @@ -517,7 +515,7 @@ namespace gridtools {
meta::make_indices_for<InnerTup>>;

template <class Tup,
class Accessors = meta::transform<get_accessors, get_accessors<Tup &&>>,
class Accessors = meta::transform<get_accessors, get_accessors<Tup>>,
class First = meta::first<to_types<Tup>>,
class Res = from_types<First, meta::flatten<Accessors>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup) const {
Expand All @@ -534,7 +532,7 @@ namespace gridtools {
using get_drop_front_generator = get_nth_f<N + I::value>;

template <class Tup,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = from_types<Tup, meta::drop_front_c<N, Accessors>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup) const {
using generators =
Expand All @@ -558,7 +556,7 @@ namespace gridtools {
struct push_back_f {
template <class Tup,
class... Args,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = from_types<Tup, meta::push_back<Accessors, Args &&...>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup, Args &&... args) const {
return push_back_impl_f<std::make_index_sequence<size<Accessors>::value>, Res>{}(
Expand All @@ -581,7 +579,7 @@ namespace gridtools {
struct push_front_f {
template <class Tup,
class... Args,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = from_types<Tup, meta::push_front<Accessors, Args &&...>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup, Args &&... args) const {
return push_front_impl_f<std::make_index_sequence<size<Accessors>::value>, Res>{}(
Expand Down Expand Up @@ -634,7 +632,7 @@ namespace gridtools {
size_t N,
class State,
class Tup,
class AllAccessors = get_accessors<Tup &&>,
class AllAccessors = get_accessors<Tup>,
class Accessors = meta::drop_front_c<I, AllAccessors>,
class Res = meta::lfold<meta_fun, State &&, Accessors>,
std::enable_if_t<(I + 4 < N), int> = 0>
Expand All @@ -651,15 +649,15 @@ namespace gridtools {

template <class State,
class Tup,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = meta::lfold<meta_fun, State &&, Accessors>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(State &&state, Tup &&tup) const {
return impl<0, size<std::decay_t<Tup>>::value>(
wstd::forward<State>(state), wstd::forward<Tup>(tup));
}

template <class Tup,
class AllAccessors = get_accessors<Tup &&>,
class AllAccessors = get_accessors<Tup>,
class StateAccessor = meta::first<AllAccessors>,
class Accessors = meta::drop_front_c<1, AllAccessors>,
class Res = meta::lfold<meta_fun, StateAccessor, Accessors>>
Expand Down Expand Up @@ -753,7 +751,7 @@ namespace gridtools {

template <class Tup,
class First = meta::first<to_types<Tup>>,
class Accessors = meta::transform<get_accessors, get_accessors<Tup &&>>,
class Accessors = meta::transform<get_accessors, get_accessors<Tup>>,
class Types = meta::transpose<Accessors>,
class InnerTuples = meta::transform<get_inner_tuple_f<Tup>::template apply, Types>,
class Res = from_types<First, InnerTuples>>
Expand All @@ -774,7 +772,7 @@ namespace gridtools {
};

template <class Tup,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = from_types<Tup, meta::reverse<Accessors>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup) const {
using n_t = size<std::decay_t<Tup>>;
Expand Down Expand Up @@ -813,7 +811,7 @@ namespace gridtools {
meta::if_c<I::value == N, insert_val_generator_f, insert_tup_generator_f<I::value - 1>>>;

template <class Tup,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Types = meta::insert_c<N, Accessors, Val>,
class Res = from_types<Tup, Types>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup) const {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,19 @@ namespace gridtools {
return arg;
}

// Evaluates a non-arithmetic argument by handing it to `eval`.
//
// How `arg` is taken depends on the compiler: nvcc performs much worse in the dycore if it is an lvalue reference,
// so the default variant takes it by value and moves it into `eval`. The Intel compiler 18.0 segfaults on the
// by-value variant, so for `__INTEL_COMPILER <= 1800` the argument is taken by const reference instead.
#if !defined(__INTEL_COMPILER) || (__INTEL_COMPILER > 1800)
template <typename Eval, typename Arg, std::enable_if_t<!std::is_arithmetic<Arg>::value, int> = 0>
GT_FUNCTION GT_CONSTEXPR decltype(auto) apply_eval(Eval &eval, Arg arg) {
    return eval(wstd::move(arg));
}
#else
template <typename Eval, typename Arg, std::enable_if_t<!std::is_arithmetic<Arg>::value, int> = 0>
GT_FUNCTION GT_CONSTEXPR decltype(auto) apply_eval(Eval &eval, Arg const &arg) {
    return eval(arg);
}
#endif

template <class Eval, class Op, class Arg>
GT_FUNCTION GT_CONSTEXPR auto value(Eval &eval, expr<Op, Arg> const &arg) {
Expand Down
Loading

0 comments on commit 9bb2d64

Please sign in to comment.