dotnet · jakobbotsch · Sep 29, 2022 · Sep 27, 2022 · Sep 27, 2022 · Sep 27, 2022
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
diff --git a/src/coreclr/scripts/superpmi.py b/src/coreclr/scripts/superpmi.py
@@ -342,7 +342,6 @@ def add_core_root_arguments(parser, build_type_default, build_type_help):
 asm_diff_parser.add_argument("--debuginfo", action="store_true", help="Include debug info after disassembly (sets COMPlus_JitDebugDump).")
 asm_diff_parser.add_argument("-tag", help="Specify a word to add to the directory name where the asm diffs will be placed")
 asm_diff_parser.add_argument("-metrics", action="append", help="Metrics option to pass to jit-analyze. Can be specified multiple times, or pass comma-separated values.")
-asm_diff_parser.add_argument("-retainOnlyTopFiles", action="store_true", help="Retain only top .dasm files with largest improvements or regressions and delete remaining files.")
 asm_diff_parser.add_argument("--diff_with_release", action="store_true", help="Specify if this is asmdiff using release binaries.")
 asm_diff_parser.add_argument("--git_diff", action="store_true", help="Produce a '.diff' file from 'base' and 'diff' folders if there were any differences.")
 
@@ -501,6 +500,11 @@ def read_csv_metrics(path):
 
     return dict
 
+def read_csv_diffs(path):
+    with open(path) as csv_file:
+        reader = csv.DictReader(csv_file)
+        return list(reader)
+
 def determine_clrjit_compiler_version(clrjit_path):
     """ Obtain the version of the compiler that was used to compile the clrjit at the specified path.
 
@@ -1568,7 +1572,7 @@ def replay_with_asm_diffs(self):
                 logging.info("Running asm diffs of %s", mch_file)
 
                 fail_mcl_file = os.path.join(temp_location, os.path.basename(mch_file) + "_fail.mcl")
-                diff_mcl_file = os.path.join(temp_location, os.path.basename(mch_file) + "_diff.mcl")
+                diffs_info = os.path.join(temp_location, os.path.basename(mch_file) + "_diffs.csv")
 
                 base_metrics_summary_file = os.path.join(temp_location, os.path.basename(mch_file) + "_base_metrics.csv")
                 diff_metrics_summary_file = os.path.join(temp_location, os.path.basename(mch_file) + "_diff_metrics.csv")
@@ -1577,7 +1581,7 @@ def replay_with_asm_diffs(self):
                     "-a",  # Asm diffs
                     "-v", "ewmi",  # display errors, warnings, missing, jit info
                     "-f", fail_mcl_file,  # Failing mc List
-                    "-diffMCList", diff_mcl_file,  # Create all of the diffs in an mcl file
+                    "-diffsInfo", diffs_info,  # Information about diffs
                     "-r", os.path.join(temp_location, "repro"),  # Repro name, create .mc repro files
                     "-baseMetricsSummary", base_metrics_summary_file, # Create summary of metrics we can use to get total code size impact
                     "-diffMetricsSummary", diff_metrics_summary_file,
@@ -1633,8 +1637,10 @@ def replay_with_asm_diffs(self):
                             repro_base_command_line = "{} {} {}".format(self.superpmi_path, " ".join(altjit_asm_diffs_flags), self.diff_jit_path)
                             save_repro_mc_files(temp_location, self.coreclr_args, artifacts_base_name, repro_base_command_line)
 
+                diffs = read_csv_diffs(diffs_info)
+
                 # This file had asm diffs; keep track of that.
-                has_diffs = is_nonzero_length_file(diff_mcl_file)
+                has_diffs = len(diffs) > 0
                 if has_diffs:
                     files_with_asm_diffs.append(mch_file)
 
@@ -1649,12 +1655,6 @@ def replay_with_asm_diffs(self):
                     if return_code == 0:
                         logging.warning("Warning: SuperPMI returned a zero exit code, but generated a non-zero-sized mcl file")
 
-                    self.diff_mcl_contents = None
-                    with open(diff_mcl_file) as file_handle:
-                        mcl_lines = file_handle.readlines()
-                        mcl_lines = [item.strip() for item in mcl_lines]
-                        self.diff_mcl_contents = mcl_lines
-
                     asm_root_dir = create_unique_directory_name(self.coreclr_args.spmi_location, "asm.{}".format(artifacts_base_name))
                     base_asm_location = os.path.join(asm_root_dir, "base")
                     diff_asm_location = os.path.join(asm_root_dir, "diff")
@@ -1672,13 +1672,13 @@ def replay_with_asm_diffs(self):
                     text_differences = queue.Queue()
                     jit_dump_differences = queue.Queue()
 
-                    async def create_replay_artifacts(print_prefix, item, self, mch_file, env, jit_differences_queue, base_location, diff_location, extension):
+                    async def create_replay_artifacts(print_prefix, context_index, self, mch_file, env, jit_differences_queue, base_location, diff_location, extension):
                         """ Run superpmi over an MC to create JIT asm or JIT dumps for the method.
                         """
                         # Setup flags to call SuperPMI for both the diff jit and the base jit
 
                         flags = [
-                            "-c", item,
+                            "-c", str(context_index),
                             "-v", "q"  # only log from the jit.
                         ]
                         flags += altjit_replay_flags
@@ -1690,7 +1690,7 @@ async def create_replay_artifacts(print_prefix, item, self, mch_file, env, jit_d
 
                             async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                                 command = [self.superpmi_path] + flags + [jit_path, mch_file]
-                                item_path = os.path.join(location, "{}{}".format(item, extension))
+                                item_path = os.path.join(location, "{}{}".format(context_index, extension))
                                 modified_env = env.copy()
                                 modified_env['COMPlus_JitStdOutFile'] = item_path
                                 logging.debug("%sGenerating %s", print_prefix, item_path)
@@ -1706,22 +1706,20 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                             diff_txt = await create_one_artifact(self.diff_jit_path, diff_location, flags + diff_option_flags_for_diff_artifact)
 
                             if base_txt != diff_txt:
-                                jit_differences_queue.put_nowait(item)
+                                jit_differences_queue.put_nowait(context_index)
                     ################################################################################################ end of create_replay_artifacts()
 
-                    diff_items = []
-                    for item in self.diff_mcl_contents:
-                        diff_items.append(item)
-
                     logging.info("Creating dasm files: %s %s", base_asm_location, diff_asm_location)
-                    subproc_helper = AsyncSubprocessHelper(diff_items, verbose=True)
+                    dasm_contexts = self.pick_contexts_to_disassemble(diffs)
+                    subproc_helper = AsyncSubprocessHelper(dasm_contexts, verbose=True)
                     subproc_helper.run_to_completion(create_replay_artifacts, self, mch_file, asm_complus_vars_full_env, text_differences, base_asm_location, diff_asm_location, ".dasm")
 
                     if self.coreclr_args.diff_jit_dump:
                         logging.info("Creating JitDump files: %s %s", base_dump_location, diff_dump_location)
                         subproc_helper.run_to_completion(create_replay_artifacts, self, mch_file, jit_dump_complus_vars_full_env, jit_dump_differences, base_dump_location, diff_dump_location, ".txt")
 
-                    logging.info("Differences found. To replay SuperPMI use:")
+                    logging.info("Contexts with differences found. To replay SuperPMI use:".format(len(diffs)))
+
                     logging.info("")
                     for var, value in asm_complus_vars.items():
                         print_platform_specific_environment_vars(logging.INFO, self.coreclr_args, var, value)
@@ -1736,9 +1734,10 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                         logging.info("%s %s -c ### %s %s", self.superpmi_path, " ".join(altjit_replay_flags), self.diff_jit_path, mch_file)
                         logging.info("")
 
-                    logging.debug("Method numbers with binary differences:")
-                    for item in self.diff_mcl_contents:
-                        logging.debug(item)
+                    smallest = sorted(diffs, key=lambda r: int(r["Context size"]))[20:]
+                    logging.debug("Smallest {} contexts with binary differences:".format(len(smallest)))
+                    for diff in smallest:
+                        logging.debug(diff["Context"])
                     logging.debug("")
 
                     if base_metrics is not None and diff_metrics is not None:
@@ -1769,13 +1768,22 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                                 # It appears we have a built jit-analyze on the path, so try to run it.
                                 jit_analyze_summary_file = os.path.join(asm_root_dir, "summary.md")
                                 command = [ jit_analyze_path, "--md", jit_analyze_summary_file, "-r", "--base", base_asm_location, "--diff", diff_asm_location ]
-                                if self.coreclr_args.retainOnlyTopFiles:
-                                    command += [ "--retainOnlyTopFiles" ]
                                 if self.coreclr_args.metrics:
                                     command += [ "--metrics", ",".join(self.coreclr_args.metrics) ]
                                 elif base_bytes is not None and diff_bytes is not None:
                                     command += [ "--override-total-base-metric", str(base_bytes), "--override-total-diff-metric", str(diff_bytes) ]
 
+                                if len(dasm_contexts) < len(diffs):
+                                    # Avoid producing analysis output that assumes these are all contexts
+                                    # with diffs. When no custom metrics are specified we pick only a subset
+                                    # of interesting contexts to disassemble, to avoid spending too long.
+                                    # See pick_contexts_to_disassemble.
+                                    command += [ "--is-subset-of-diffs" ]
+                                else:
+                                    # Ask jit-analyze to avoid producing analysis output that assumes these are
+                                    # all contexts (we never produce .dasm files for contexts without diffs).
+                                    command += [ "--is-diffs-only" ]
+
                                 run_and_log(command, logging.INFO)
                                 ran_jit_analyze = True
 
@@ -1800,6 +1808,14 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                     else:
                         logging.warning("No textual differences. Is this an issue with coredistools?")
 
+                    # If we are not specifying custom metrics then print a summary here, otherwise leave the summarization up to jit-analyze.
+                    if self.coreclr_args.metrics is None:
+                        num_improvements = sum(1 for r in diffs if int(r["Diff size"]) < int(r["Base size"]))
+                        num_regressions = sum(1 for r in diffs if int(r["Diff size"]) > int(r["Base size"]))
+                        logging.info("{} contexts with differences found ({} improvements, {} regressions)".format(len(diffs), num_improvements, num_regressions))
+                        logging.info("")
+                        logging.info("")
+
                     if self.coreclr_args.diff_jit_dump:
                         try:
                             current_jit_dump_diff = jit_dump_differences.get_nowait()
@@ -1994,6 +2010,49 @@ def write_pivot_section(row):
         return result
         ################################################################################################ end of replay_with_asm_diffs()
 
+    def pick_contexts_to_disassemble(self, diffs):
+        """ Given information about diffs, pick the context numbers to create .dasm files for
+
+        Returns:
+            List of method context numbers to create disassembly for.
+        """
+
+        # If there are non-default metrics then we need to disassemble
+        # everything so that jit-analyze can handle those.
+        if self.coreclr_args.metrics is not None:
+            contexts = diffs
+        else:
+            # In the default case we have size improvements/regressions
+            # available without needing to disassemble all, so pick a subset of
+            # interesting diffs to pass to jit-analyze.
+
+            # 20 smallest method contexts with diffs
+            smallest_contexts = sorted(diffs, key=lambda r: int(r["Context size"]))[:20]
+            # Order by byte-wise improvement, largest improvements first
+            by_diff_size = sorted(diffs, key=lambda r: int(r["Diff size"]) - int(r["Base size"]))
+            # 20 top improvements, byte-wise
+            top_improvements_bytes = by_diff_size[:20]
+            # 20 top regressions, byte-wise
+            top_regressions_bytes = by_diff_size[-20:]
+
+            # Order by percentage-wise size improvement, largest improvements first
+            def diff_pct(r):
+                base = int(r["Base size"])
+                if base == 0:
+                    return 0
+                diff = int(r["Diff size"])
+                return (diff - base) / base
+
+            by_diff_size_pct = sorted(diffs, key=diff_pct)
+            top_improvements_pct = by_diff_size_pct[:20]
+            top_regressions_pct = by_diff_size_pct[-20:]
+
+            contexts = smallest_contexts + top_improvements_bytes + top_regressions_bytes + top_improvements_pct + top_regressions_pct
+
+        final_contexts_indices = list(set(int(r["Context"]) for r in contexts))
+        final_contexts_indices.sort()
+        return final_contexts_indices
+
 ################################################################################
 # SuperPMI Replay/TP diff
 ################################################################################
@@ -3908,11 +3967,6 @@ def verify_base_diff_args():
                             lambda unused: True,
                             "Unable to set metrics.")
 
-        coreclr_args.verify(args,
-                            "retainOnlyTopFiles",
-                            lambda unused: True,
-                            "Unable to set retainOnlyTopFiles.")
-
         coreclr_args.verify(args,
                             "diff_with_release",
                             lambda unused: True,

diff --git a/src/coreclr/scripts/superpmi_diffs.py b/src/coreclr/scripts/superpmi_diffs.py
@@ -216,8 +216,7 @@ def main(main_args):
         "-spmi_location", spmi_location,
         "-error_limit", "100",
         "-log_level", "debug",
-        "-log_file", log_file,
-        "-retainOnlyTopFiles"])
+        "-log_file", log_file)
 
     print("Running superpmi.py tpdiff")
 

diff --git a/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt b/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt
@@ -26,6 +26,7 @@ set(SUPERPMI_SOURCES
     neardiffer.cpp
     parallelsuperpmi.cpp
     superpmi.cpp
+    fileio.cpp
     jithost.cpp
     ../superpmi-shared/callutils.cpp
     ../superpmi-shared/compileresult.cpp

diff --git a/src/coreclr/tools/superpmi/superpmi/commandline.cpp b/src/coreclr/tools/superpmi/superpmi/commandline.cpp
@@ -318,15 +318,15 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
 
                 o->mclFilename = argv[i];
             }
-            else if ((_strnicmp(&argv[i][1], "diffMCList", 10) == 0))
+            else if ((_strnicmp(&argv[i][1], "diffsInfo", 9) == 0))
             {
                 if (++i >= argc)
                 {
                     DumpHelp(argv[0]);
                     return false;
                 }
 
-                o->diffMCLFilename = argv[i];
+                o->diffsInfo = argv[i];
             }
             else if ((_strnicmp(&argv[i][1], "target", 6) == 0))
             {
@@ -641,9 +641,9 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
         DumpHelp(argv[0]);
         return false;
     }
-    if (o->diffMCLFilename != nullptr && !o->applyDiff)
+    if (o->diffsInfo != nullptr && !o->applyDiff)
     {
-        LogError("-diffMCList specified without -applyDiff.");
+        LogError("-diffsInfo specified without -applyDiff.");
         DumpHelp(argv[0]);
         return false;
     }

diff --git a/src/coreclr/tools/superpmi/superpmi/commandline.h b/src/coreclr/tools/superpmi/superpmi/commandline.h
@@ -39,7 +39,7 @@ class CommandLine
             , baseMetricsSummaryFile(nullptr)
             , diffMetricsSummaryFile(nullptr)
             , mclFilename(nullptr)
-            , diffMCLFilename(nullptr)
+            , diffsInfo(nullptr)
             , targetArchitecture(nullptr)
             , compileList(nullptr)
             , offset(-1)
@@ -72,7 +72,7 @@ class CommandLine
         char* baseMetricsSummaryFile;
         char* diffMetricsSummaryFile;
         char* mclFilename;
-        char* diffMCLFilename;
+        char* diffsInfo;
         char* targetArchitecture;
         char* compileList;
         int   offset;