Skip to content

Commit

Permalink
Merge pull request #3828 from rhc54/cmr30/bele
Browse files: browse the repository at this point in the history
Detect when the system contains a mix of big-endian (BE) and little-endian (LE) nodes, warn that Open MPI (OMPI) does not currently support such an environment, and error out
  • Loading branch information
bwbarrett authored Jul 7, 2017
2 parents b92a139 + 2f4b3ab commit fe41070
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 25 deletions.
17 changes: 1 addition & 16 deletions config/opal_configure_options.m4
Original file line number Diff line number Diff line change
Expand Up @@ -286,22 +286,7 @@ fi
AC_DEFINE_UNQUOTED(OPAL_ENABLE_DLOPEN_SUPPORT, $OPAL_ENABLE_DLOPEN_SUPPORT,
[Whether we want to enable dlopen support])

#
# Heterogeneous support
#

AC_MSG_CHECKING([if want heterogeneous support])
AC_ARG_ENABLE([heterogeneous],
[AC_HELP_STRING([--enable-heterogeneous],
[Enable features required for heterogeneous
platform support (default: disabled)])])
if test "$enable_heterogeneous" = "yes" ; then
AC_MSG_RESULT([yes])
opal_want_heterogeneous=1
else
AC_MSG_RESULT([no])
opal_want_heterogeneous=0
fi
opal_want_heterogeneous=0
AC_DEFINE_UNQUOTED([OPAL_ENABLE_HETEROGENEOUS_SUPPORT],
[$opal_want_heterogeneous],
[Enable features required for heterogeneous support])
Expand Down
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ AC_CACHE_SAVE
opal_show_title "Header file tests"

AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \
dlfcn.h execinfo.h err.h fcntl.h grp.h libgen.h \
dlfcn.h endian.h execinfo.h err.h fcntl.h grp.h libgen.h \
libutil.h memory.h netdb.h netinet/in.h netinet/tcp.h \
poll.h pthread.h pty.h pwd.h sched.h \
strings.h stropts.h linux/ethtool.h linux/sockios.h \
Expand Down
25 changes: 18 additions & 7 deletions opal/mca/hwloc/base/hwloc_base_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_ENDIAN_H
#include <endian.h>
#endif

#include "opal/runtime/opal.h"
#include "opal/constants.h"
Expand Down Expand Up @@ -2163,7 +2166,7 @@ int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, op
char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
{
int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt;
char *sig=NULL, *arch=NULL;
char *sig=NULL, *arch = NULL, *endian;
hwloc_obj_t obj;
unsigned i;

Expand All @@ -2183,14 +2186,22 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
break;
}
}

if (NULL == arch) {
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH",
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt);
} else {
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s",
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch);
arch = "unknown";
}

#ifdef __BYTE_ORDER
#if __BYTE_ORDER == __LITTLE_ENDIAN
endian = "le";
#else
endian = "be";
#endif
#else
endian = "unknown";
#endif

asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s",
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch, endian);
return sig;
}

Expand Down
13 changes: 12 additions & 1 deletion orte/mca/plm/base/help-plm-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
Expand Down Expand Up @@ -162,3 +162,14 @@ A call was made to launch additional processes, but this process has
no active out-of-band transports and therefore cannot execute this call.
Please check to see if you have the "oob" MCA parameter set and ensure
that it is either unset or at least includes the tcp transport.
#
[multi-endian]
Open MPI does not currently support multi-endian operations. We have
detected that the following node differs in endianness:


Nodename: %s
Endian: %s
Local endian: %s

Please correct the situation and try again.
27 changes: 27 additions & 0 deletions orte/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -1055,12 +1055,23 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
int i;
bool found;
orte_daemon_cmd_flag_t cmd;
char *myendian;

/* get the daemon job, if necessary */
if (NULL == jdatorted) {
jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
}

/* get my endianness */
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
if (NULL == t) {
/* should never happen */
myendian = "unknown";
} else {
myendian = strrchr(t->sig, ':');
++myendian;
}

/* multiple daemons could be in this buffer, so unpack until we exhaust the data */
idx = 1;
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
Expand Down Expand Up @@ -1240,8 +1251,24 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
}
free(sig);
break;
} else {
/* check if the difference is due to the endianness */
ptr = strrchr(sig, ':');
++ptr;
if (0 != strcmp(ptr, myendian)) {
/* we don't currently handle multi-endian operations in the
* MPI support */
orte_show_help("help-plm-base", "multi-endian", true,
nodename, ptr, myendian);
orted_failed_launch = true;
if (NULL != topo) {
hwloc_topology_destroy(topo);
}
goto CLEANUP;
}
}
}

if (!found) {
/* nope - save the signature and request the complete topology from that node */
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
Expand Down

0 comments on commit fe41070

Please sign in to comment.