Skip to content

Commit

Permalink
Enabling thread pool to be numa-aware (#13778)
Browse files Browse the repository at this point in the history
The PR enables ort thread pool to be numa-aware, so that threads could
be evenly created and distributed among numa nodes.
In addition, to facilitate performance tuning, the PR opens a new API
allowing customers to attach threads to certain logical processors.
Please check the API
[definition](https://github.com/microsoft/onnxruntime/pull/13778/files#diff-5845a5c76fb64abdc8f0cffe21b37f8da1712674eb3abc4cd87190891be1bd48)
for details.

Co-authored-by: Randy Shuai <rashuai@microsoft.com>
  • Loading branch information
RandySheriffH and RandyShuai authored Dec 12, 2022
1 parent b8d941f commit 75584c5
Show file tree
Hide file tree
Showing 27 changed files with 1,178 additions and 607 deletions.
2 changes: 1 addition & 1 deletion cmake/onnxruntime_common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ target_include_directories(onnxruntime_common
${OPTIONAL_LITE_INCLUDE_DIR})


target_link_libraries(onnxruntime_common PUBLIC safeint_interface ${GSL_TARGET})
target_link_libraries(onnxruntime_common PUBLIC safeint_interface ${GSL_TARGET} ${ABSEIL_LIBS})

add_dependencies(onnxruntime_common ${onnxruntime_EXTERNAL_DEPENDENCIES})

Expand Down
5 changes: 5 additions & 0 deletions include/onnxruntime/core/common/logging/logging.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ class LoggingManager final {
*/
static const Logger& DefaultLogger();

/**
Tell whether a default logger is currently available.
@returns true when s_default_logger_ is non-null, false otherwise.
*/
static bool HasDefaultLogger() { return s_default_logger_ != nullptr; }

/**
Change the minimum severity level for log messages to be output by the default logger.
@param severity The severity.
Expand Down
21 changes: 21 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -3614,6 +3614,27 @@ struct OrtApi {
*/
ORT_API2_STATUS(UpdateEnvWithCustomLogLevel, _In_ OrtEnv* ort_env, OrtLoggingLevel log_severity_level);

/** \brief Set affinities for intra op threads
*
* Affinity string follows format:
* logical_processor_id,logical_processor_id;logical_processor_id,logical_processor_id
* Semicolons separate the configurations of different threads, while commas separate the processors the ith thread is expected to attach to.
* e.g. 1,2,3;4,5
* specifies affinities for two threads, with the 1st thread attached to the 1st, 2nd, and 3rd processors, and the 2nd thread to the 4th and 5th.
* To ease the configuration, an "interval" is also allowed:
* e.g. 1-8;9-16;17-24
* specifies that the 1st thread runs on the first eight processors, the 2nd thread runs on the next eight processors, and so forth.
* Note:
* 1. Once set, the number of thread affinities must equal intra_op_num_threads - 1,
* since ort does not set affinity on the main thread, which is started and managed by the calling app;
* 2. On Windows, ort infers the group id from a logical processor id. For example, assuming there are two groups, each with 64 logical processors,
* an id of 64 will be inferred as the last processor of the 1st group, while 65 will be interpreted as the 1st processor of the second group.
* Hence 64-65 is an invalid configuration, because a Windows thread cannot be attached to processors across a group boundary.
*
* \since Version 1.14
*/
ORT_API2_STATUS(SetGlobalIntraOpThreadAffinity, _Inout_ OrtThreadingOptions* tp_options, const char* affinity_string);

#ifdef __cplusplus
OrtApi(const OrtApi&)=delete; // Prevent users from accidentally copying the API structure, it should always be passed as a pointer
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,20 @@ static const char* const kOrtSessionOptionsConfigForceSpinningStop = "session.fo
// "0": in some cases warnings will be logged but processing will continue. The default.
// May be useful to expose bugs in models.
static const char* const kOrtSessionOptionsConfigStrictShapeTypeInference = "session.strict_shape_type_inference";

// This option allows setting affinities for intra op threads.
// Affinity string follows format:
// logical_processor_id,logical_processor_id;logical_processor_id,logical_processor_id
// Semicolons separate the configurations of different threads, while commas separate the processors the ith thread is expected to attach to.
// e.g. 1,2,3;4,5
// specifies affinities for two threads, with the 1st thread attached to the 1st, 2nd, and 3rd processors, and the 2nd thread to the 4th and 5th.
// To ease the configuration, an "interval" is also allowed:
// e.g. 1-8;9-16;17-24
// specifies that the 1st thread runs on the first eight processors, the 2nd thread runs on the next eight processors, and so forth.
// Note:
// 1. Once set, the number of thread affinities must equal intra_op_num_threads - 1, since ort does not set affinity on the main thread, which
// is started and managed by the calling app;
// 2. On Windows, ort infers the group id from a logical processor id. For example, assuming there are two groups, each with 64 logical processors,
// an id of 64 will be inferred as the last processor of the 1st group, while 65 will be interpreted as the 1st processor of the second group.
// Hence 64-65 is an invalid configuration, because a Windows thread cannot be attached to processors across a group boundary.
static const char* const kOrtSessionOptionsConfigIntraOpThreadAffinities = "session.intra_op_thread_affinities";
6 changes: 3 additions & 3 deletions onnxruntime/core/common/threadpool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -383,10 +383,10 @@ ThreadPool::ThreadPool(Env* env,
if (degree_of_parallelism >= 2) {
int threads_to_create = degree_of_parallelism - 1;

if (!thread_options_.affinity.empty()) {
if (!thread_options_.affinities.empty()) {
// Remove first affinity element as designated for the caller thread
thread_options_.affinity.erase(thread_options_.affinity.begin());
assert(thread_options_.affinity.size() >= size_t(threads_to_create));
thread_options_.affinities.erase(thread_options_.affinities.begin());
assert(thread_options_.affinities.size() >= size_t(threads_to_create));
}

extended_eigen_threadpool_ =
Expand Down
6 changes: 4 additions & 2 deletions onnxruntime/core/framework/config_options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@ Status ConfigOptions::AddConfigEntry(const char* config_key, const char* config_
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Config key is empty or longer than maximum length 128");

std::string val(config_value);
if (val.length() > 1024)
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Config value is longer than maximum length 1024");
if (val.length() > onnxruntime::kMaxStrLen)
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Config value is longer than maximum length: ",
onnxruntime::kMaxStrLen);

auto iter = configurations.find(config_key);
if (iter != configurations.cend()) {
Expand Down
5 changes: 2 additions & 3 deletions onnxruntime/core/platform/env.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ struct ThreadOptions {
// that are contained in a given physical core with the same index as the thread. ORT does not set any affinity
// to the thread that is considered main (the thread that initiates the creation of the TP).
// The process that owns the thread may consider setting its affinity.
std::vector<LogicalProcessors> affinity;
std::vector<LogicalProcessors> affinities;

// Set or unset denormal as zero.
bool set_denormal_as_zero = false;
Expand Down Expand Up @@ -139,8 +139,7 @@ class Env {
/// <returns>Number of physical cores</returns>
virtual int GetNumPhysicalCpuCores() const = 0;

// This function currently doesn't support systems with more than 64 logical processors on Windows
virtual std::vector<LogicalProcessors> GetThreadAffinityMasks() const = 0;
virtual std::vector<LogicalProcessors> GetDefaultThreadAffinities() const = 0;

/// \brief Returns the number of micro-seconds since the Unix epoch.
virtual uint64_t NowMicros() const {
Expand Down
23 changes: 16 additions & 7 deletions onnxruntime/core/platform/posix/env.cc
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,8 @@ class PosixThread : public EnvThread {
custom_join_thread_fn = thread_options.custom_join_thread_fn;

auto param_ptr = std::make_unique<Param>(name_prefix, index, start_address, param);
if (narrow<size_t>(index) < thread_options.affinity.size()) {
param_ptr->affinity = thread_options.affinity[index];
if (narrow<size_t>(index) < thread_options.affinities.size()) {
param_ptr->affinity = thread_options.affinities[index];
}

if (custom_create_thread_fn) {
Expand Down Expand Up @@ -233,12 +233,21 @@ class PosixThread : public EnvThread {
if (p->affinity.has_value() && !p->affinity->empty()) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
for(auto id : *p->affinity) {
CPU_SET(id, &cpuset);
for (auto id : *p->affinity) {
if (id > -1 && id < CPU_SETSIZE) {
CPU_SET(id, &cpuset);
} else {
// Logical processor ids start from 0 internally, but from 1 in the ort API,
// which is why the id is increased by 1 when logging.
LOGS_DEFAULT(ERROR) << "cpu " << id + 1 << " does not exist, skipping it for affinity setting";
}
}
// pthread_setaffinity_np() does not set errno, it returns it.
auto ret = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
if (ret != 0) {
if (0 == ret) {
LOGS_DEFAULT(VERBOSE) << "pthread_setaffinity_np succeed for thread: " << syscall(SYS_gettid)
<< ", index: " << p->index
<< ", mask: " << *p->affinity;
} else {
auto [err_no, err_msg] = GetSystemError(ret);
LOGS_DEFAULT(ERROR) << "pthread_setaffinity_np failed for thread: " << syscall(SYS_gettid)
<< ", index: " << p->index
Expand Down Expand Up @@ -290,7 +299,7 @@ class PosixEnv : public Env {
return DefaultNumCores();
}

std::vector<LogicalProcessors> GetThreadAffinityMasks() const override {
std::vector<LogicalProcessors> GetDefaultThreadAffinities() const override {

std::vector<LogicalProcessors> ret;
#ifdef ORT_USE_CPUINFO
Expand Down
Loading

0 comments on commit 75584c5

Please sign in to comment.