From 6f1c79eb6101529e1534fb27ff23b74bd3b38f69 Mon Sep 17 00:00:00 2001
From: Marshall Ward <marshall.ward@noaa.gov>
Date: Tue, 2 Apr 2024 14:55:26 -0400
Subject: [PATCH] CPP expression parser for makedep

This patch adds a relatively robust parser for C preprocessor
expressions inside of an #if statement.

The following are supported:

* Nearly all operators, including arithmetic, logical, and bitwise,

* Parentheses within expressions,

* defined() evaluations.

The following are explicitly not supported:

* Function macros,

* Multiline preprocessor directives.

No doubt there are other lingering issues, but this is comprehensive
enough to handle both MOM6 as well as current and legacy FMS source
codes.

Existing Makefile.dep output files appear to be mostly unchanged.  One
rule (data_override.o) had its arguments reordered but is otherwise
unchanged.  mpp_data.o had its rule corrected to use mpp_util_mpi.inc
rather than mpp_util_nocomm.inc.

Some fixes and adjustments were made to the overall makedep source:

* Input macros (-D) are now stored as key-value dicts, rather than
  simply a list of macro names.

* Input macros are now passed to all scan_fortran_file() calls, rather
  than just the Fortran source.

* Input macros are now correctly passed to FMS makedep.  Previously,
  these were omitted from the Makefile generation.

* Previously, #if blocks were always set to True, even though the
  comments indicated that they were always set to False.  Given that
  neither of these was ever correct, it's amazing that we were able to
  survive this long without prior incident.

The motivation for this PR comes from issues with Makefile generation in
FMS.  Older versions of FMS were unable to correctly resolve their
dependencies in fft.f90 on certain systems (perhaps caused
by filesystem peculiarities).  Newer versions of FMS were unable to
handle changing the #if block default from True to False.  Inevitably,
we threw up our hands and solved the underlying problem.
---
 ac/deps/Makefile.fms.in |   2 +-
 ac/makedep              | 205 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 195 insertions(+), 12 deletions(-)

diff --git a/ac/deps/Makefile.fms.in b/ac/deps/Makefile.fms.in
index 71c46f082a..e4617f1428 100644
--- a/ac/deps/Makefile.fms.in
+++ b/ac/deps/Makefile.fms.in
@@ -23,4 +23,4 @@ ARFLAGS = @ARFLAGS@
 .PHONY: depend
 depend: Makefile.dep
 Makefile.dep:
-	$(PYTHON) $(MAKEDEP) -o Makefile.dep -e -x libFMS.a -s @srcdir@/test_fms @srcdir@
+	$(PYTHON) $(MAKEDEP) $(DEFS) -o Makefile.dep -e -x libFMS.a -s @srcdir@/test_fms @srcdir@
diff --git a/ac/makedep b/ac/makedep
index e37f35aca5..4c9cc9229b 100755
--- a/ac/makedep
+++ b/ac/makedep
@@ -10,7 +10,8 @@ import re
 import sys
 
 
-# Pre-compile re searches
+# Fortran tokenization
+
 re_module = re.compile(r"^ *module +([a-z_0-9]+)")
 re_use = re.compile(r"^ *use +([a-z_0-9]+)")
 re_cpp_define = re.compile(r"^ *# *define +[_a-zA-Z][_a-zA-Z0-9]")
@@ -32,6 +33,80 @@ re_procedure = re.compile(
 )
 
 
+# Preprocessor expression tokenization
+# NOTE: re.Scanner takes the first matching entry, so longer operators must
+# be listed before their single-character prefixes (e.g. '!=' before '!').
+cpp_scanner = re.Scanner([
+  (r'defined\b', lambda scanner, token: token),
+  (r'[_A-Za-z][_0-9a-zA-Z]*', lambda scanner, token: token),
+  (r'[0-9]+', lambda scanner, token: token),
+  (r'\(', lambda scanner, token: token),
+  (r'\)', lambda scanner, token: token),
+  (r'\*', lambda scanner, token: token),
+  (r'/', lambda scanner, token: token),
+  (r'\+', lambda scanner, token: token),
+  (r'-', lambda scanner, token: token),
+  (r'!=', lambda scanner, token: token),
+  (r'!', lambda scanner, token: token),
+  (r'>>', lambda scanner, token: token),
+  (r'>=', lambda scanner, token: token),
+  (r'>', lambda scanner, token: token),
+  (r'<<', lambda scanner, token: token),
+  (r'<=', lambda scanner, token: token),
+  (r'<', lambda scanner, token: token),
+  (r'==', lambda scanner, token: token),
+  (r'&&', lambda scanner, token: token),
+  (r'&', lambda scanner, token: token),
+  (r'\^', lambda scanner, token: token),
+  (r'\|\|', lambda scanner, token: token),
+  (r'\|', lambda scanner, token: token),
+  (r'\#\s*if', None),
+  (r'\s+', None),
+])
+
+
+# Python implementation of each operator token
+cpp_operate = {
+    '!': lambda x: not x,
+    '*': lambda x, y: x * y,
+    '/': lambda x, y: x // y,
+    '+': lambda x, y: x + y,
+    '-': lambda x, y: x - y,
+    '>>': lambda x, y: x >> y,
+    '<<': lambda x, y: x << y,
+    '==': lambda x, y: x == y,
+    '!=': lambda x, y: x != y,
+    '>': lambda x, y: x > y,
+    '>=': lambda x, y: x >= y,
+    '<': lambda x, y: x < y,
+    '<=': lambda x, y: x <= y,
+    '&': lambda x, y: x & y,
+    '^': lambda x, y: x ^ y,
+    '|': lambda x, y: x | y,
+    '&&': lambda x, y: x and y,
+    '||': lambda x, y: x or y,
+}
+
+
+# Operator precedence; a higher rank binds more tightly
+cpp_op_rank = {
+    '(': 13,
+    '!': 12,
+    '*': 11, '/': 11,
+    '+': 10, '-': 10,
+    '>>': 9, '<<': 9,
+    '>': 8, '>=': 8, '<': 8, '<=': 8,
+    '==': 7, '!=': 7,
+    '&': 6,
+    '^': 5,
+    '|': 4,
+    '&&': 3,
+    '||': 2,
+    ')': 1, '$': 1,
+    None: 0,
+}
+
+
 def create_deps(src_dirs, skip_dirs, makefile, debug, exec_target, fc_rule,
                 link_externals, defines):
     """Create "makefile" after scanning "src_dis"."""
@@ -105,7 +180,7 @@ def create_deps(src_dirs, skip_dirs, makefile, debug, exec_target, fc_rule,
         all_modules += mods
 
     for f in c_files:
-        _, _, cpp, inc, _, _ = scan_fortran_file(f)
+        _, _, cpp, inc, _, _ = scan_fortran_file(f, defines)
         # maps object file to .h files included
         o2h[object_file(f)] = cpp
         externals.append(object_file(f))
@@ -158,7 +233,7 @@ def create_deps(src_dirs, skip_dirs, makefile, debug, exec_target, fc_rule,
             ]
             missing_mods = [m for m in o2uses[o] if m not in all_modules]
 
-            incs, inc_used = nested_inc(o2h[o] + o2inc[o], f2F)
+            incs, inc_used = nested_inc(o2h[o] + o2inc[o], f2F, defines)
             inc_mods = [u for u in inc_used if u not in found_mods and u in all_modules]
 
             incdeps = sorted(set([f2F[f] for f in incs if f in f2F]))
@@ -250,7 +325,7 @@ def link_obj(obj, o2uses, mod2o, all_modules):
     return sorted(set(olst))
 
 
-def nested_inc(inc_files, f2F):
+def nested_inc(inc_files, f2F, defines):
     """List of all files included by "inc_files", either by #include or F90
     include."""
     hlst = []
@@ -260,7 +335,7 @@ def nested_inc(inc_files, f2F):
         if hfile not in f2F.keys():
             return
 
-        _, used, cpp, inc, _, _ = scan_fortran_file(f2F[hfile])
+        _, used, cpp, inc, _, _ = scan_fortran_file(f2F[hfile], defines)
 
         # Record any module updates inside of include files
         used_mods.update(used)
@@ -286,7 +361,8 @@ def scan_fortran_file(src_file, defines=None):
 
     cpp_defines = defines if defines is not None else []
 
-    cpp_macros = [define.split('=')[0] for define in cpp_defines]
+    #cpp_macros = [define.split('=')[0] for define in cpp_defines]
+    cpp_macros = dict([t.split('=') for t in cpp_defines])
     cpp_group_stack = []
 
     with io.open(src_file, 'r', errors='replace') as file:
@@ -328,9 +404,9 @@ def scan_fortran_file(src_file, defines=None):
             if match:
                 cpp_group_stack.append(cpp_exclude)
 
-                # XXX: Don't attempt to parse #if statements, but store the state.
-                # if/endif stack.  For now, assume that these always fail.
-                cpp_exclude = False
+                cpp_expr_value = cpp_expr_eval(line, cpp_macros)
+
+                cpp_exclude = not cpp_expr_value
 
             # Complement #else condition group
             match = re_cpp_else.match(line)
@@ -351,8 +427,14 @@ def scan_fortran_file(src_file, defines=None):
             # Activate a new macro (ignoring the value)
             match = re_cpp_define.match(line)
             if match:
-                new_macro = line.lstrip()[1:].split()[1]
-                cpp_macros.append(new_macro)
+                tokens = line.strip()[1:].split(maxsplit=2)
+                macro = tokens[1]
+                value = tokens[2] if tokens[2:] else None
+                if '(' in macro:
+                    # TODO: Actual handling of function macros
+                    macro, arg = macro.split('(', maxsplit=1)
+                    value = '(' + arg + value
+                cpp_macros[macro] = value
 
             # Deactivate a macro
             match = re_cpp_undef.match(line)
@@ -441,6 +523,107 @@ def add_suff(lst, suff):
     return [f + suff for f in lst]
 
 
+def cpp_expr_eval(expr, macros=None):
+    """Return the value of the expression in a preprocessor #if line.
+
+    expr is the text of the directive, which is tokenized by cpp_scanner.
+    macros maps each defined macro name to its replacement text, or to None
+    if it has no value.  It is only read, and defaults to no macros.
+
+    The expression is evaluated by precedence climbing, using cpp_op_rank to
+    rank the binary operators in cpp_operate.  Unary '!', parentheses and
+    defined (with or without parentheses) are parsed as operands, so they
+    can be nested and combined freely.
+
+    A macro whose value is a string of digits is replaced by that integer.
+    Other values are used as unevaluated strings, and undefined or valueless
+    macros are zero.  A ValueError is raised if expr cannot be parsed.
+    """
+    if macros is None:
+        macros = {}
+
+    results, remainder = cpp_scanner.scan(expr)
+
+    # Abort if any characters are not tokenized
+    if remainder:
+        raise ValueError(
+            'Untokenized characters {!r} in {!r}'.format(remainder, expr)
+        )
+
+    # Tokens are consumed from the end of this list.
+    pending = results[::-1]
+
+    # Unary '!' is in cpp_operate, but it cannot join two operands.
+    binary_ops = set(cpp_operate) - {'!'}
+
+    def peek():
+        """Return the next token without consuming it, or None at the end."""
+        return pending[-1] if pending else None
+
+    def advance(expected=None):
+        """Consume and return the next token, optionally checking it."""
+        if not pending:
+            raise ValueError('Incomplete expression: {!r}'.format(expr))
+        tok = pending.pop()
+        if expected is not None and tok != expected:
+            raise ValueError(
+                'Expected {!r}, not {!r}, in {!r}'.format(expected, tok, expr)
+            )
+        return tok
+
+    def operand():
+        """Parse a value, applying any prefix operators and parentheses."""
+        tok = advance()
+
+        if tok == '!':
+            return cpp_operate[tok](operand())
+
+        if tok == '(':
+            value = binary(cpp_op_rank[None])
+            advance(')')
+            return value
+
+        if tok == 'defined':
+            parens = peek() == '('
+            if parens:
+                advance('(')
+            name = advance()
+            if parens:
+                advance(')')
+            # NOTE: Any key in `macros` is considered to be set, even if the
+            # value is None.
+            return name in macros
+
+        if tok.isdigit():
+            return int(tok)
+
+        if tok.isidentifier():
+            # "Identifiers that are not macros, which are all considered to be
+            # the number zero." (CPP manual, 4.2.2)
+            value = macros.get(tok) or '0'
+            return int(value) if value.isdigit() else value
+
+        raise ValueError(
+            'Unsupported token {!r} in expression {!r}'.format(tok, expr)
+        )
+
+    def binary(min_rank):
+        """Parse operands joined by operators ranked above min_rank."""
+        value = operand()
+        while peek() in binary_ops and cpp_op_rank[peek()] > min_rank:
+            op = advance()
+            # Operators of equal rank associate from left to right.
+            value = cpp_operate[op](value, binary(cpp_op_rank[op]))
+        return value
+
+    value = binary(cpp_op_rank[None])
+    if pending:
+        raise ValueError(
+            'Unexpected token {!r} in expression {!r}'.format(peek(), expr)
+        )
+    return value
+
+
 # Parse arguments
 parser = argparse.ArgumentParser(
     description="Generate make dependencies for F90 source code."