Merge pull request #207 from ludwig-cf/develop

Release 0.18.0
ludwig-cf · Sep 13, 2022 · 73dab7d · 73dab7d
2 parents 720a794 + c93e10d
commit 73dab7d
Show file tree

Hide file tree

Showing 188 changed files with 2,525 additions and 4,659 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,7 +1,17 @@
 language: c
+cache:
+  - directories:
+    - mpi
+before_install:
+  - bash ./config/build-mpi.sh
 script:
   - cp config/travis-gcc.mk ./config.mk
   - make serial
   - make
   - export OMP_NUM_THREADS=1
   - make test
+  - make clean
+  - export PATH=$(pwd)/mpi/bin:${PATH}
+  - cp config/travis-mpicc.mk ./config.mk
+  - make
+  - make unit
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,6 +1,28 @@
 
 ### Changes
 
+version 0.18.0
+
+- Added a lubrication correction offset to allow an option for keeping
+  particles clear of plane walls.
+  See https://ludwig.epcc.ed.ac.uk/inputs/colloid.html
+- Added options for arranging 'first touch' on allocation of memory
+  for LB data and field components.
+  See https://ludwig.epcc.ed.ac.uk/inputs/parallel.html
+
+- Various minor code improvements.
+
+version 0.17.2
+
+- Bug fix (issue #204) prevent crashes with s7_anchoring related to
+  proximity of wall/colloid or colloid/colloid. Advice added to
+  documentation on avoiding such close approaches.
+
+version 0.17.1
+
+- Bug fix (issue #197). The liquid crystal reduced field strength was reported
+  incorrectly in the output (always zero). Thanks to Oliver H. for spotting.
+
 version 0.17.0
 
 - add liquid crystal anchoring "fd_gradient_calculation s7_anchoring"

diff --git a/Makefile b/Makefile
@@ -30,9 +30,12 @@ test:
 	$(MAKE) -C target test
 	$(MAKE) -C tests test
 
+unit:
+	$(MAKE) -C tests/unit test
+
 clean:
 	$(MAKE) -C mpi_s clean
 	$(MAKE) -C target clean
 	$(MAKE) -C src clean
 	$(MAKE) -C tests clean
-	$(MAKE) -C util clean
+	$(MAKE) -C util clean
diff --git a/README.md b/README.md
@@ -18,8 +18,8 @@ in a free energy picture.
 The code is written in standard ANSI C, and uses the Message Passing
 Interface for distributed memory parallelism. Threaded parallelism is
 also available via a lightweight abstraction layer ("Target Data Parallel"
-or "TargetDP") which currently supports either OpenMP or CUDA (NVIDIA GPUs)
-from a single source.
+or "TargetDP") which currently supports OpenMP, CUDA (NVIDIA GPUs) or
+HIP (AMD GPUs) from a single source.
 
 #### Installation
 

diff --git a/config/build-mpi.sh b/config/build-mpi.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+set -e
+
+printf "MPI installation script at %s\n" "$(date)"
+
+my_prefix=$(pwd)/mpi
+
+if [ -f mpi/bin/mpicc ]; then
+    export PATH=${my_prefix}/bin:${PATH}
+    printf "Added existing local mpi to path:\n"
+    printf "PATH is %s\n" "${PATH}"
+    which mpicc
+else
+    printf "%s %s\n" "Install MPI to prefix" "${my_prefix}"
+    wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.4.tar.gz
+    tar xf openmpi-4.1.4.tar.gz
+    cd openmpi-4.1.4
+    ./configure CC=gcc CXX=g++    \
+                --enable-mpi-fortran=no \
+                --prefix=${my_prefix}
+    make -j 2
+    make install
+    cd -
+    rm -rf openmpi-4.1.4.tar.gz
+    rm -rf openmpi-4.1.4
+fi
diff --git a/config/travis-mpicc.mk b/config/travis-mpicc.mk
@@ -0,0 +1,25 @@
+##############################################################################
+#
+#  travis-mpicc.mk
+#
+#  Parallel unit tests
+#
+##############################################################################
+
+BUILD   = parallel
+MODEL   = -D_D3Q19_
+
+CC      = mpicc -fopenmp
+CFLAGS  = -O2 -g -Wall -Werror
+
+AR      = ar
+ARFLAGS = -cru
+LDFLAGS =
+
+LAUNCH_SERIAL_CMD =
+LAUNCH_MPIRUN_CMD = mpirun --oversubscribe
+MPIRUN_NTASK_FLAG = -np
+
+# Unit tests only
+MPIRUN_NTASK_UNIT = 4
+
diff --git a/docs/tutorial/tutorial.tex b/docs/tutorial/tutorial.tex
@@ -288,7 +288,7 @@ \subsection{Test 2: Spinodal Decomposition of a Binary Fluid}
 raw data files and time steps (with appropriate start, increment and end in the \texttt{for} loop):
 \begin{lstlisting}
 #!/bin/bash
-for i in `seq 1000 1000 10000`;
+for i in $(seq 1000 1000 10000);
 do
   tstep=$(printf "%08d" $i)
   ./extract -k vel.001-001.meta vel-${tstep}.001-001  
@@ -429,7 +429,7 @@ \subsection{Test 3: Colloids Dispersed in a Liquid Crystalline Fluid}
 \item Again, a bash script like this one below can be used for convenient post-processing of multiple raw data files and time steps (with appropriate start, increment and end in the \texttt{for} loop):
 \begin{lstlisting}
 #!/bin/bash
-for i in `seq 1000 1000 10000`;
+for i in $(seq 1000 1000 10000);
 do
   tstep=$(printf "%08d" $i)
   ./extract -k vel.001-001.meta vel-${tstep}.001-001  

diff --git a/src/blue_phase.c b/src/blue_phase.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2011-2021 The University of Edinburgh
+ *  (c) 2011-2022 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -209,21 +209,17 @@ __host__ int fe_lc_target(fe_lc_t * fe, fe_t ** target) {
 
 __host__ int fe_lc_param_commit(fe_lc_t * fe) {
 
-  int ia;
   double e0_freq, t;
-  double e0[3];
   physics_t * phys = NULL;
   PI_DOUBLE(pi);
 
   assert(fe);
 
   physics_ref(&phys);
-  physics_e0(phys, e0);
   physics_e0_frequency(phys, &e0_freq);
   physics_control_time(phys, &t);
-  for (ia = 0; ia < 3; ia++) {
-    fe->param->e0coswt[ia] = cos(2.0*pi*e0_freq*t)*e0[ia];
-  }
+
+  fe->param->coswt = cos(2.0*pi*e0_freq*t);
 
   tdpMemcpyToSymbol(tdpSymbol(const_param), fe->param, sizeof(fe_lc_param_t),
 		    0, tdpMemcpyHostToDevice);
@@ -388,8 +384,10 @@ __host__ __device__ int fe_lc_compute_fed(fe_lc_t * fe, double gamma,
 
   efield = 0.0;
   for (ia = 0; ia < 3; ia++) {
+    double ea = fe->param->e0[ia]*fe->param->coswt;
     for (ib = 0; ib < 3; ib++) {
-      efield += fe->param->e0coswt[ia]*q[ia][ib]*fe->param->e0coswt[ib];
+      double eb = fe->param->e0[ib]*fe->param->coswt;
+      efield += ea*q[ia][ib]*eb;
     }
   }
 
@@ -1094,13 +1092,15 @@ int fe_lc_compute_h(fe_lc_t * fe, double gamma, double q[3][3],
 
   e2 = 0.0;
   for (ia = 0; ia < 3; ia++) {
-    e2 += fe->param->e0coswt[ia]*fe->param->e0coswt[ia];
+    double ea = fe->param->e0[ia]*fe->param->coswt;
+    e2 += ea*ea;
   }
 
   for (ia = 0; ia < 3; ia++) {
+    double ea = fe->param->e0[ia]*fe->param->coswt;
     for (ib = 0; ib < 3; ib++) {
-      h[ia][ib] +=  fe->param->epsilon
-	*(fe->param->e0coswt[ia]*fe->param->e0coswt[ib] - r3*d[ia][ib]*e2);
+      double eb = fe->param->e0[ib]*fe->param->coswt;
+      h[ia][ib] +=  fe->param->epsilon*(ea*eb - r3*d[ia][ib]*e2);
     }
   }
 
@@ -1294,42 +1294,36 @@ int fe_lc_reduced_temperature(fe_lc_t * fe, double * tau) {
 
 /*****************************************************************************
  *
- *  fe_lc_dimensionless_field_strength
+ *  fe_lc_dimensionless_field_strength
+ *
+ *  Return the dimensionless (or reduced) field strength which is
+ *      ered^2 = (27 epsilon / 32 pi A_0 gamma) E_a E_a
  *
- *  Return the dimensionless field strength which is
- *      e^2 = (27 epsilon / 32 pi A_O gamma) E_a E_a
+ *  The external field is e0[3] in lattice units. No phase.
  *
  *****************************************************************************/
 
-__host__ __device__
-int fe_lc_dimensionless_field_strength(fe_lc_t * fe, double * ered) {
+__host__ int fe_lc_dimensionless_field_strength(const fe_lc_param_t * param,
+						double * ered) {
 
-  int ia;
-  double a0;
-  double gamma;
-  double epsilon;
-  double fieldsq;
-  double e0[3];
-  physics_t * phys = NULL;
+  double fieldsq = 0.0;
   PI_DOUBLE(pi);
 
-  assert(fe);
-
-  physics_ref(&phys);
-  physics_e0(phys, e0);
+  assert(param);
 
-  fieldsq = 0.0;
-  for (ia = 0; ia < 3; ia++) {
-    fieldsq += e0[ia]*e0[ia];
+  for (int ia = 0; ia < 3; ia++) {
+    fieldsq += param->e0[ia]*param->e0[ia];
   }
 
   /* Remember epsilon is stored with factor (1/12pi) */ 
 
-  a0 = fe->param->a0;
-  gamma = fe->param->gamma;
-  epsilon = 12.0*pi*fe->param->epsilon;
+  {
+    double a0 = param->a0;
+    double gamma = param->gamma;
+    double epsilon = 12.0*pi*param->epsilon;
 
-  *ered = sqrt(27.0*epsilon*fieldsq/(32.0*pi*a0*gamma));
+    *ered = sqrt(27.0*epsilon*fieldsq/(32.0*pi*a0*gamma));
+  }
 
   return 0;
 }
@@ -1754,6 +1748,8 @@ void fe_lc_stress_v(fe_lc_t * fe, int index, double s[3][3][NSIMDVL]) {
   double * __restrict__ grad;
   double * __restrict__ delsq;
 
+  assert(fe);
+
   data = fe->q->data;
   grad = fe->dq->grad;
   delsq = fe->dq->delsq;
@@ -1791,7 +1787,40 @@ void fe_lc_stress_v(fe_lc_t * fe, int index, double s[3][3][NSIMDVL]) {
   for_simd_v(iv, NSIMDVL) dsq[Z][Z][iv] = 0.0 - dsq[X][X][iv] - dsq[Y][Y][iv];
 
   fe_lc_compute_h_v(fe, q, dq, dsq, h);
+#ifndef AMD_GPU_WORKAROUND
+  /* Standard computation. */
   fe_lc_compute_stress_v(fe, q, dq, h, s);
+#else
+  {
+    /* This is avoiding what appears to be a spurious memory access
+     * error arising from the unrolled stress. It's not entirely
+     * clear where this is coming from at the moment (beyond
+     * "something to do with optimisation"). */
+    int ia, ib;
+    double q1[3][3];
+    double dq1[3][3][3];
+    double h1[3][3];
+    double s1[3][3];
+
+    for (iv = 0; iv < NSIMDVL; iv++) {
+      for (ia = 0; ia < 3; ia++) {
+	for (ib = 0; ib < 3; ib++) {
+	  q1[ia][ib] = q[ia][ib][iv];
+	  dq1[0][ia][ib] = dq[0][ia][ib][iv];
+	  dq1[1][ia][ib] = dq[1][ia][ib][iv];
+	  dq1[2][ia][ib] = dq[2][ia][ib][iv];
+	  h1[ia][ib] = h[ia][ib][iv];
+	}
+      }
+      fe_lc_compute_stress(fe, q1, dq1, h1, s1);
+      for (ia = 0; ia < 3; ia++) {
+	for (ib = 0; ib < 3; ib++) {
+	  s[ia][ib][iv] = s1[ia][ib];
+	}
+      }
+    }
+  }
+#endif
 
   if (fe->param->is_active) {
     int ib;
@@ -2024,9 +2053,11 @@ void fe_lc_compute_fed_v(fe_lc_t * fe,
   for_simd_v(iv, NSIMDVL)  efield[iv] = 0.0;
 
   for (ia = 0; ia < 3; ia++) {
+    double ea = fe->param->e0[ia]*fe->param->coswt;
     for (ib = 0; ib < 3; ib++) {
+      double eb = fe->param->e0[ib]*fe->param->coswt;
       for_simd_v(iv, NSIMDVL) {
-	efield[iv] += fe->param->e0coswt[ia]*q[ia][ib][iv]*fe->param->e0coswt[ib];
+	efield[iv] += ea*q[ia][ib][iv]*eb;
       }
     }
   }
@@ -2059,7 +2090,7 @@ void fe_lc_compute_fed_v(fe_lc_t * fe,
  *
  *****************************************************************************/
 
-__host__ __device__ __inline__
+__host__ __device__
 void fe_lc_compute_h_v(fe_lc_t * fe,
 		       double q[3][3][NSIMDVL], 
 		       double dq[3][3][3][NSIMDVL],
@@ -2217,16 +2248,18 @@ void fe_lc_compute_h_v(fe_lc_t * fe,
   for_simd_v(iv, NSIMDVL) e2[iv] = 0.0;
 
   for (ia = 0; ia < 3; ia++) {
+    double ea = fe->param->e0[ia]*fe->param->coswt;
     for_simd_v(iv, NSIMDVL) {
-      e2[iv] += fe->param->e0coswt[ia]*fe->param->e0coswt[ia];
+      e2[iv] += ea*ea;
     }
   }
 
   for (ia = 0; ia < 3; ia++) {
+    double ea = fe->param->e0[ia]*fe->param->coswt;
     for (ib = 0; ib < 3; ib++) {
+      double eb = fe->param->e0[ib]*fe->param->coswt;
       for_simd_v(iv, NSIMDVL) {
-	h[ia][ib][iv] +=  fe->param->epsilon*
-	  (fe->param->e0coswt[ia]*fe->param->e0coswt[ib] - r3*d[ia][ib]*e2[iv]);
+	h[ia][ib][iv] +=  fe->param->epsilon*(ea*eb - r3*d[ia][ib]*e2[iv]);
       }
     }
   }

diff --git a/src/blue_phase.h b/src/blue_phase.h
@@ -1,3 +1,4 @@
+
 /*****************************************************************************
  *
  *  fe_lc.h
@@ -65,7 +66,8 @@ struct fe_lc_param_s {
   double rredshift;                       /* Reciprocal redshift */
   double epsilon;                         /* Dielectric anistropy */
   double amplitude0;                      /* Initial amplitude from input */
-  double e0coswt[3];                      /* Electric field */
+  double e0[3];                           /* Electric field (external) */
+  double coswt;                           /* Electric field (phase) */
 
   double w1_coll;                         /* Anchoring strength parameter */
   double w2_coll;                         /* Second anchoring parameter */
@@ -147,9 +149,6 @@ int fe_lc_chirality(fe_lc_t * fe, double * chirality);
 __host__ __device__
 int fe_lc_reduced_temperature(fe_lc_t * fe,  double * tau);
 
-__host__ __device__
-int fe_lc_dimensionless_field_strength(fe_lc_t * fe, double * edm);
-
 __host__ __device__
 void fe_lc_mol_field_v(fe_lc_t * fe, int index, double h[3][3][NSIMDVL]);
 
@@ -184,6 +183,9 @@ int fe_lc_grad_stress(fe_lc_t * fe, int index, double sgrad[3][3]);
 
 /* Function of the parameters only */
 
+__host__ int fe_lc_dimensionless_field_strength(const fe_lc_param_t * param,
+						double * e0);
+
 __host__ __device__
 int fe_lc_amplitude_compute(fe_lc_param_t * param, double * a);