diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index c14ec464a43a66..4ebf304e23d587 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -233,10 +233,7 @@ linux_projects=$(add-dependencies ${linux_projects_to_test} | sort | uniq)
 
 windows_projects_to_test=$(exclude-windows $(compute-projects-to-test ${modified_projects}))
 windows_check_targets=$(check-targets ${windows_projects_to_test} | sort | uniq)
-# Temporary disable the windows job.
-# See https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840
-#windows_projects=$(add-dependencies ${windows_projects_to_test} | sort | uniq)
-windows_projects=""
+windows_projects=$(add-dependencies ${windows_projects_to_test} | sort | uniq)
 
 # Generate the appropriate pipeline
 if [[ "${linux_projects}" != "" ]]; then
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index a704e855f011cb..9561bf668a90cb 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -38,6 +38,12 @@ targets="${2}"
 
 echo "--- cmake"
 pip install -q -r ${MONOREPO_ROOT}/mlir/python/requirements.txt
+
+# The CMAKE_*_LINKER_FLAGS to disable the manifest come from research
+# on fixing a build reliability issue on the build server, please
+# see https://github.com/llvm/llvm-project/pull/82393 and
+# https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40
+# for further information.
 cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \
       -D LLVM_ENABLE_PROJECTS="${projects}" \
       -G Ninja \
@@ -49,7 +55,10 @@ cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \
       -D COMPILER_RT_BUILD_ORC=OFF \
       -D CMAKE_C_COMPILER_LAUNCHER=sccache \
       -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \
-      -D MLIR_ENABLE_BINDINGS_PYTHON=ON
+      -D MLIR_ENABLE_BINDINGS_PYTHON=ON \
+      -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \
+      -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \
+      -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO"
 
 echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml
index f2a831ad3577ad..53da8662b0203a 100644
--- a/.github/workflows/release-tasks.yml
+++ b/.github/workflows/release-tasks.yml
@@ -28,6 +28,7 @@ jobs:
     name: Create a New Release
     runs-on: ubuntu-latest
     needs: validate-tag
+
     steps:
       - name: Install Dependencies
         run: |
@@ -40,8 +41,9 @@ jobs:
       - name: Create Release
         env:
           GITHUB_TOKEN: ${{ github.token }}
+          USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
         run: |
-          ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --release ${{ needs.validate-tag.outputs.release-version }} --user ${{ github.actor }} create
+          ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --release ${{ needs.validate-tag.outputs.release-version }} --user ${{ github.actor }} --user-token "$USER_TOKEN" create
   release-documentation:
     name: Build and Upload Release Documentation
     needs:
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index fb4d7a02dd086f..711baf45f449a0 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -833,6 +833,7 @@ to ``float``; see below for more information on this emulation.
   * 32-bit ARM (natively on some architecture versions)
   * 64-bit ARM (AArch64) (natively on ARMv8.2a and above)
   * AMDGPU (natively)
+  * NVPTX (natively)
   * SPIR (natively)
   * X86 (if SSE2 is available; natively if AVX512-FP16 is also available)
   * RISC-V (natively if Zfh or Zhinx is available)
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 3d456610e7a7f1..60a499533510d9 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -255,6 +255,11 @@ class Driver {
   /// from non-system headers are emitted.
   HeaderIncludeFilteringKind CCPrintHeadersFiltering = HIFIL_None;
 
+  /// Name of the library that provides implementations of
+  /// IEEE-754 128-bit float math functions used by Fortran F128
+  /// runtime library. It should be linked as needed by the linker job.
+  std::string FlangF128MathLibrary;
+
   /// Set CC_LOG_DIAGNOSTICS mode, which causes the frontend to log diagnostics
   /// to CCLogDiagnosticsFilename or to stderr, in a stable machine readable
   /// format.
@@ -446,6 +451,11 @@ class Driver {
   bool offloadHostOnly() const { return Offload == OffloadHost; }
   bool offloadDeviceOnly() const { return Offload == OffloadDevice; }
 
+  void setFlangF128MathLibrary(std::string name) {
+    FlangF128MathLibrary = std::move(name);
+  }
+  StringRef getFlangF128MathLibrary() const { return FlangF128MathLibrary; }
+
   /// Compute the desired OpenMP runtime from the flags provided.
   OpenMPRuntimeKind getOpenMPRuntime(const llvm::opt::ArgList &Args) const;
 
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index a8efae3a1ce388..b47c399fef6042 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -61,6 +61,10 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
   NoAsmVariants = true;
   GPU = CudaArch::UNUSED;
 
+  // PTX supports f16 as a fundamental type.
+  HasLegalHalfType = true;
+  HasFloat16 = true;
+
   if (TargetPointerWidth == 32)
     resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
   else if (Opts.NVPTXUseShortPointers)
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index cd955b6c849456..d5fc1d5dd25a8b 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -368,11 +368,7 @@ void BareMetal::AddLinkRuntimeLib(const ArgList &Args,
   ToolChain::RuntimeLibType RLT = GetRuntimeLibType(Args);
   switch (RLT) {
   case ToolChain::RLT_CompilerRT: {
-    const std::string FileName = getCompilerRT(Args, "builtins");
-    llvm::StringRef BaseName = llvm::sys::path::filename(FileName);
-    BaseName.consume_front("lib");
-    BaseName.consume_back(".a");
-    CmdArgs.push_back(Args.MakeArgString("-l" + BaseName));
+    CmdArgs.push_back(getCompilerRTArgString(Args, "builtins"));
     return;
   }
   case ToolChain::RLT_Libgcc:
@@ -462,11 +458,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   for (const auto &LibPath : TC.getLibraryPaths())
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-L", LibPath)));
 
-  const std::string FileName = TC.getCompilerRT(Args, "builtins");
-  llvm::SmallString<128> PathBuf{FileName};
-  llvm::sys::path::remove_filename(PathBuf);
-  CmdArgs.push_back(Args.MakeArgString("-L" + PathBuf));
-
   if (TC.ShouldLinkCXXStdlib(Args))
     TC.AddCXXStdlibLibArgs(Args, CmdArgs);
 
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index e031f60731834d..9c7a0e9699c59b 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1396,6 +1396,14 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args,
   // add the correct libraries to link against as dependents in the object
   // file.
   if (!TC.getTriple().isKnownWindowsMSVCEnvironment()) {
+    StringRef f128LibName = TC.getDriver().getFlangF128MathLibrary();
+    f128LibName.consume_front_insensitive("lib");
+    if (!f128LibName.empty()) {
+      CmdArgs.push_back("-lFortranFloat128Math");
+      addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/true);
+      CmdArgs.push_back(Args.MakeArgString("-l" + f128LibName));
+      addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/false);
+    }
     CmdArgs.push_back("-lFortranRuntime");
     CmdArgs.push_back("-lFortranDecimal");
   }
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index 8d344f9b63961a..8b41a949fd6734 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -170,6 +170,9 @@ class UncountedCallArgsChecker
     if (!Callee)
       return false;
 
+    if (isMethodOnWTFContainerType(Callee))
+      return true;
+
     auto overloadedOperatorType = Callee->getOverloadedOperator();
     if (overloadedOperatorType == OO_EqualEqual ||
         overloadedOperatorType == OO_ExclaimEqual ||
@@ -198,6 +201,31 @@ class UncountedCallArgsChecker
     return false;
   }
 
+  bool isMethodOnWTFContainerType(const FunctionDecl *Decl) const {
+    if (!isa<CXXMethodDecl>(Decl))
+      return false;
+    auto *ClassDecl = Decl->getParent();
+    if (!ClassDecl || !isa<CXXRecordDecl>(ClassDecl))
+      return false;
+
+    auto *NsDecl = ClassDecl->getParent();
+    if (!NsDecl || !isa<NamespaceDecl>(NsDecl))
+      return false;
+
+    auto MethodName = safeGetName(Decl);
+    auto ClsNameStr = safeGetName(ClassDecl);
+    StringRef ClsName = ClsNameStr; // FIXME: Make safeGetName return StringRef.
+    auto NamespaceName = safeGetName(NsDecl);
+    // FIXME: These should be implemented via attributes.
+    return NamespaceName == "WTF" &&
+           (MethodName == "find" || MethodName == "findIf" ||
+            MethodName == "reverseFind" || MethodName == "reverseFindIf" ||
+            MethodName == "get" || MethodName == "inlineGet" ||
+            MethodName == "contains" || MethodName == "containsIf") &&
+           (ClsName.ends_with("Vector") || ClsName.ends_with("Set") ||
+            ClsName.ends_with("Map"));
+  }
+
   void reportBug(const Expr *CallArg, const ParmVarDecl *Param) const {
     assert(CallArg);
 
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp
new file mode 100644
index 00000000000000..0a63a789856127
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp
@@ -0,0 +1,146 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+
+#include "mock-types.h"
+
+namespace WTF {
+
+  template <typename T>
+  class HashSet {
+  public:
+    template <typename U> T* find(U&) const;
+    template <typename U> bool contains(U&) const;
+    unsigned size() { return m_size; }
+    template <typename U> void add(U&) const;
+    template <typename U> void remove(U&) const;
+
+  private:
+    T* m_table { nullptr };
+    unsigned m_size { 0 };
+  };
+
+  template <typename T, typename S>
+  class HashMap {
+  public:
+    struct Item {
+      T key;
+      S value;
+    };
+
+    template <typename U> Item* find(U&) const;
+    template <typename U> bool contains(U&) const;
+    template <typename U> S* get(U&) const;
+    template <typename U> S* inlineGet(U&) const;
+    template <typename U> void add(U&) const;
+    template <typename U> void remove(U&) const;
+
+  private:
+    Item* m_table { nullptr };
+  };
+
+  template <typename T>
+  class WeakHashSet {
+  public:
+    template <typename U> T* find(U&) const;
+    template <typename U> bool contains(U&) const;
+    template <typename U> void add(U&) const;
+    template <typename U> void remove(U&) const;
+  };
+
+  template <typename T>
+  class Vector {
+  public:
+    unsigned size() { return m_size; }
+    T& at(unsigned i) { return m_buffer[i]; }
+    T& operator[](unsigned i) { return m_buffer[i]; }
+    template <typename U> unsigned find(U&);
+    template <typename U> unsigned reverseFind(U&);
+    template <typename U> bool contains(U&);
+    template <typename MatchFunction> unsigned findIf(const MatchFunction& match)
+    {
+      for (unsigned i = 0; i < m_size; ++i) {
+        if (match(at(i)))
+          return i;
+      }
+      return static_cast<unsigned>(-1);
+    }
+    template <typename MatchFunction> unsigned reverseFindIf(const MatchFunction& match)
+    {
+      for (unsigned i = 0; i < m_size; ++i) {
+        if (match(at(m_size - i)))
+          return i;
+      }
+      return static_cast<unsigned>(-1);
+    }
+    template <typename MatchFunction> bool containsIf(const MatchFunction& match)
+    {
+      for (unsigned i = 0; i < m_size; ++i) {
+        if (match(at(m_size - i)))
+          return true;
+      }
+      return false;
+    }
+    template <typename U> void append(U&) const;
+    template <typename U> void remove(U&) const;
+
+  private:
+    T* m_buffer { nullptr };
+    unsigned m_size { 0 };
+  };
+
+}
+
+using WTF::HashSet;
+using WTF::HashMap;
+using WTF::WeakHashSet;
+using WTF::Vector;
+
+class RefCounted {
+public:
+  void ref() const;
+  void deref() const;
+};
+
+RefCounted* object();
+
+void test() {
+  HashSet<RefPtr<RefCounted>> set;
+  set.find(*object());
+  set.contains(*object());
+  set.add(*object());
+  // expected-warning@-1{{Call argument is uncounted and unsafe}}
+  set.remove(*object());
+  // expected-warning@-1{{Call argument is uncounted and unsafe}}
+
+  HashMap<Ref<RefCounted>, unsigned> map;
+  map.find(*object());
+  map.contains(*object());
+  map.inlineGet(*object());
+  map.add(*object());
+  // expected-warning@-1{{Call argument is uncounted and unsafe}}
+  map.remove(*object());
+  // expected-warning@-1{{Call argument is uncounted and unsafe}}
+
+  WeakHashSet<Ref<RefCounted>> weakSet;
+  weakSet.find(*object());
+  weakSet.contains(*object());
+  weakSet.add(*object());
+  // expected-warning@-1{{Call argument is uncounted and unsafe}}
+  weakSet.remove(*object());
+  // expected-warning@-1{{Call argument is uncounted and unsafe}}
+
+  Vector<Ref<RefCounted>> vector;
+  vector.at(0);
+  vector[0];
+  vector.find(*object());
+  vector.reverseFind(*object());
+  vector.contains(*object());
+  vector.append(*object());
+  // expected-warning@-1{{Call argument is uncounted and unsafe}}
+  vector.remove(*object());
+  // expected-warning@-1{{Call argument is uncounted and unsafe}}
+
+  auto* obj = object();
+  vector.findIf([&](Ref<RefCounted> key) { return key.ptr() == obj; });
+  vector.reverseFindIf([&](Ref<RefCounted> key) { return key.ptr() == obj; });
+  vector.containsIf([&](Ref<RefCounted> key) { return key.ptr() == obj; });
+}
\ No newline at end of file
diff --git a/clang/test/Driver/arm-compiler-rt.c b/clang/test/Driver/arm-compiler-rt.c
index 954947bb890f87..adecacbcaabf9c 100644
--- a/clang/test/Driver/arm-compiler-rt.c
+++ b/clang/test/Driver/arm-compiler-rt.c
@@ -3,7 +3,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir_with_arch_subdir \
 // RUN:     -rtlib=compiler-rt -### %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix ARM-EABI
-// ARM-EABI: "-lclang_rt.builtins-arm"
+// ARM-EABI: "{{[^"]*}}libclang_rt.builtins-arm.a"
 
 // RUN: %clang -target arm-linux-gnueabi \
 // RUN:     --sysroot=%S/Inputs/resource_dir_with_arch_subdir \
diff --git a/clang/test/Driver/baremetal-multilib.yaml b/clang/test/Driver/baremetal-multilib.yaml
index af26e82621c91e..3f026cbeb437b2 100644
--- a/clang/test/Driver/baremetal-multilib.yaml
+++ b/clang/test/Driver/baremetal-multilib.yaml
@@ -17,7 +17,7 @@
 # CHECK-SAME: "-x" "c++" "{{.*}}baremetal-multilib.yaml"
 # CHECK-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 # CHECK-SAME: "-L[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/lib"
-# CHECK-SAME: "-lc" "-lm" "-lclang_rt.builtins"
+# CHECK-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a"
 # CHECK-SAME: "-o" "{{.*}}.tmp.out"
 
 # RUN: %T/baremetal_multilib/bin/clang -no-canonical-prefixes -x c++ %s -### -o %t.out 2>&1 \
diff --git a/clang/test/Driver/baremetal-sysroot.cpp b/clang/test/Driver/baremetal-sysroot.cpp
index fc66020772a771..46338185ffd9d5 100644
--- a/clang/test/Driver/baremetal-sysroot.cpp
+++ b/clang/test/Driver/baremetal-sysroot.cpp
@@ -18,5 +18,5 @@
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal-sysroot.cpp"
 // CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-V6M-C-SAME: "-L{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m"
+// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a"
 // CHECK-V6M-C-SAME: "-o" "{{.*}}.o"
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index 7511d7d1adb4dd..8baf388894eb27 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -18,8 +18,7 @@
 // CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-V6M-C-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
-// CHECK-V6M-C-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" "--target2=rel" "-o" "{{.*}}.tmp.out"
+// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "{{.*}}.tmp.out"
 
 // RUN: %clang %s -### --target=armv6m-none-eabi -nostdlibinc -nobuiltininc 2>&1 \
 // RUN:     --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-LIBINC %s
@@ -37,16 +36,15 @@
 // CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-ARMV7M-PER-TARGET: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
 // CHECK-ARMV7M-PER-TARGET: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}armv7m-vendor-none-eabi
-// CHECK-ARMV7M-PER-TARGET: "-lc" "-lm" "-lclang_rt.builtins"
+// CHECK-ARMV7M-PER-TARGET: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a"
 
 // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \
 // RUN:     --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-DEFAULTCXX %s
 // CHECK-V6M-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-DEFAULTCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-V6M-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-V6M-DEFAULTCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" "--target2=rel" "-o" "a.out"
+// CHECK-V6M-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out"
 
 // RUN: %clangxx %s -### --target=armv6m-none-eabi -stdlib=libc++ 2>&1 \
 // RUN:     --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-LIBCXX %s
@@ -55,9 +53,8 @@
 // CHECK-V6M-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-LIBCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-V6M-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-V6M-LIBCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" "--target2=rel" "-o" "a.out"
+// CHECK-V6M-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out"
 
 // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \
 // RUN:     --sysroot=%S/Inputs/baremetal_arm \
@@ -68,9 +65,8 @@
 // CHECK-V6M-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}6.0.0"
 // CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-LIBSTDCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind"
-// CHECK-V6M-LIBSTDCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" "--target2=rel" "-o" "a.out"
+// CHECK-V6M-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out"
 
 // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \
 // RUN:     --sysroot=%S/Inputs/baremetal_arm \
@@ -79,7 +75,6 @@
 // CHECK-V6M-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-NDL-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 
 // RUN: rm -rf %T/baremetal_cxx_sysroot
 // RUN: mkdir -p %T/baremetal_cxx_sysroot/usr/include/c++/v1
@@ -93,9 +88,8 @@
 // CHECK-V6M-LIBCXX-USR-SAME: "-internal-isystem" "{{[^"]+}}baremetal_cxx_sysroot{{[/\\]+}}usr{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBCXX-USR: "{{[^"]*}}-Bstatic"
 // CHECK-V6M-LIBCXX-USR-SAME: "-L{{[^"]*}}{{[/\\]+}}baremetal_cxx_sysroot{{[/\\]+}}lib"
-// CHECK-V6M-LIBCXX-USR-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-V6M-LIBCXX-USR-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-V6M-LIBCXX-USR-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m"
+// CHECK-V6M-LIBCXX-USR-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a"
 
 // RUN: %clangxx --target=arm-none-eabi -v 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-THREAD-MODEL
@@ -178,8 +172,7 @@
 // CHECK-RV64-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV64-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-RV64-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
-// CHECK-RV64-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv64" "-X" "-o" "{{.*}}.tmp.out"
+// CHECK-RV64-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "{{.*}}.tmp.out"
 
 // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -187,9 +180,8 @@
 // CHECK-RV64-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-DEFAULTCXX-SAME: "-L[[RESOURCE_DIR]]{{.*}}{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-RV64-DEFAULTCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv64" "-X" "-o" "a.out"
+// CHECK-RV64-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out"
 
 // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -200,9 +192,8 @@
 // CHECK-RV64-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-LIBCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-RV64-LIBCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv64" "-X" "-o" "a.out"
+// CHECK-RV64-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -213,9 +204,8 @@
 // CHECK-RV64-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
 // CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-LIBSTDCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind"
-// CHECK-RV64-LIBSTDCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv64" "-X" "-o" "a.out"
+// CHECK-RV64-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     -L some/directory/user/asked/for \
@@ -230,8 +220,7 @@
 // CHECK-RV32-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV32-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-RV32-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
-// CHECK-RV32-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv32" "-X" "-o" "a.out"
+// CHECK-RV32-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -239,9 +228,8 @@
 // CHECK-RV32-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV32-DEFAULTCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-RV32-DEFAULTCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv32" "-X" "-o" "a.out"
+// CHECK-RV32-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -252,9 +240,8 @@
 // CHECK-RV32-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV32-LIBCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-RV32-LIBCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv32" "-X" "-o" "a.out"
+// CHECK-RV32-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -265,9 +252,8 @@
 // CHECK-RV32-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
 // CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV32-LIBSTDCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind"
-// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv32" "-X" "-o" "a.out"
+// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     -nostdlibinc -nobuiltininc \
@@ -286,7 +272,6 @@
 // CHECK-RV64-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-NDL-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     -march=rv64imafdc -mabi=lp64d \
@@ -306,7 +291,6 @@
 // CHECK-RV64FD-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64FD-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}lib"
-// CHECK-RV64FD-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     -march=rv32i -mabi=ilp32 \
@@ -326,7 +310,6 @@
 // CHECK-RV32I-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-RV32I-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32I-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}lib"
-// CHECK-RV32I-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv32i{{[/\\]+}}ilp32"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     -march=rv32im -mabi=ilp32 \
@@ -346,7 +329,6 @@
 // CHECK-RV32IM-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32IM-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}lib"
-// CHECK-RV32IM-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv32im{{[/\\]+}}ilp32"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     -march=rv32iac -mabi=ilp32 \
@@ -361,7 +343,6 @@
 // CHECK-RV32IAC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32IAC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}lib"
-// CHECK-RV32IAC-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv32iac{{[/\\]+}}ilp32"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf -march=rv32imafc -mabi=ilp32f \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -383,7 +364,6 @@
 // CHECK-RV32IMAFC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32IMAFC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}lib"
-// CHECK-RV32IMAFC-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f"
 
 // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc-unknown-eabi 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPCEABI %s
@@ -395,8 +375,7 @@
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 // CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPCEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPCEABI-SAME: "-L[[RESOURCE]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
-// CHECK-PPCEABI-SAME: "-lc" "-lm" "-lclang_rt.builtins-powerpc" "-o" "a.out"
+// CHECK-PPCEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc.a" "-o" "a.out"
 
 // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc64-unknown-eabi 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64EABI %s
@@ -408,8 +387,7 @@
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 // CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPC64EABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPC64EABI-SAME: "-L[[RESOURCE]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
-// CHECK-PPC64EABI-SAME: "-lc" "-lm" "-lclang_rt.builtins-powerpc64" "-o" "a.out"
+// CHECK-PPC64EABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc64.a" "-o" "a.out"
 
 // RUN: %clang -no-canonical-prefixes %s -### --target=powerpcle-unknown-eabi 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPCLEEABI %s
@@ -421,8 +399,7 @@
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 // CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPCLEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPCLEEABI-SAME: "-L[[RESOURCE]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
-// CHECK-PPCLEEABI-SAME: "-lc" "-lm" "-lclang_rt.builtins-powerpcle" "-o" "a.out"
+// CHECK-PPCLEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpcle.a" "-o" "a.out"
 
 // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc64le-unknown-eabi 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64LEEABI %s
@@ -434,8 +411,7 @@
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 // CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPC64LEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPC64LEEABI-SAME: "-L[[RESOURCE]]{{[/\\]+}}lib{{[/\\]+}}baremetal"
-// CHECK-PPC64LEEABI-SAME: "-lc" "-lm" "-lclang_rt.builtins-powerpc64le" "-o" "a.out"
+// CHECK-PPC64LEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc64le.a" "-o" "a.out"
 
 // Check that compiler-rt library without the arch filename suffix will
 // be used if present.
@@ -446,8 +422,8 @@
 // RUN:     --target=armv6m-none-eabi \
 // RUN:     --sysroot=%T/baremetal_clang_rt_noarch \
 // RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-NOARCH %s
-// CHECK-CLANGRT-NOARCH: "-lclang_rt.builtins"
-// CHECK-CLANGRT-NOARCH-NOT: "-lclang_rt.builtins-armv6m"
+// CHECK-CLANGRT-NOARCH: "{{[^"]*}}libclang_rt.builtins.a"
+// CHECK-CLANGRT-NOARCH-NOT: "{{[^"]*}}libclang_rt.builtins-armv6m.a"
 
 // Check that compiler-rt library with the arch filename suffix will be
 // used if present.
@@ -458,8 +434,8 @@
 // RUN:     --target=armv6m-none-eabi \
 // RUN:     --sysroot=%T/baremetal_clang_rt_arch \
 // RUN:   | FileCheck --check-prefix=CHECK-CLANGRT-ARCH %s
-// CHECK-CLANGRT-ARCH: "-lclang_rt.builtins-armv6m"
-// CHECK-CLANGRT-ARCH-NOT: "-lclang_rt.builtins"
+// CHECK-CLANGRT-ARCH: "{{[^"]*}}libclang_rt.builtins-armv6m.a"
+// CHECK-CLANGRT-ARCH-NOT: "{{[^"]*}}libclang_rt.builtins.a"
 
 // Check that "--no-relax" is forwarded to the linker for RISC-V.
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf -nostdinc -mno-relax \
@@ -471,4 +447,4 @@
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf -nostdinc \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64-RELAX %s
-// CHECK-RV64-RELAX-NOT: "--no-relax"
\ No newline at end of file
+// CHECK-RV64-RELAX-NOT: "--no-relax"
diff --git a/clang/test/SemaCUDA/float16.cu b/clang/test/SemaCUDA/float16.cu
index a9cbe87f32c100..bb5ed606438491 100644
--- a/clang/test/SemaCUDA/float16.cu
+++ b/clang/test/SemaCUDA/float16.cu
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple nvptx64 -verify %s
 // expected-no-diagnostics
 #include "Inputs/cuda.h"
 
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index f8ad39ba712f8c..21617aeea0215e 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -33,6 +33,17 @@ endif()
 
 option(FLANG_ENABLE_WERROR "Fail and stop building flang if a warning is triggered." OFF)
 
+# The out of tree builds of the compiler and the Fortran runtime
+# must use the same setting of FLANG_RUNTIME_F128_MATH_LIB
+# to be composable. Failure to synchronize this setting may result
+# in linking errors or fatal failures in F128 runtime functions.
+set(FLANG_RUNTIME_F128_MATH_LIB "" CACHE STRING
+  "Specifies the target library used for implementing IEEE-754 128-bit float \
+  math in F18 runtime, e.g. it might be libquadmath for targets where \
+  REAL(16) is mapped to __float128, or libm for targets where REAL(16) \
+  is mapped to long double, etc."
+  )
+
 # Check for a standalone build and configure as appropriate from
 # there.
 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
@@ -321,6 +332,12 @@ if (FLANG_REPOSITORY_STRING)
   add_definitions(-DFLANG_REPOSITORY_STRING="${FLANG_REPOSITORY_STRING}")
 endif()
 
+if (FLANG_RUNTIME_F128_MATH_LIB)
+  add_compile_definitions(
+    -DFLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}"
+    )
+endif()
+
 include(TestBigEndian)
 test_big_endian(IS_BIGENDIAN)
 if (IS_BIGENDIAN)
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 3f1e22ecca4ccc..7cb99d61a686ed 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -494,12 +494,13 @@ struct RuntimeFunction {
   fir::runtime::FuncTypeBuilderFunc typeGenerator;
 };
 
-/// Callback type for generating lowering for a math operation.
-using MathGeneratorTy = mlir::Value (*)(fir::FirOpBuilder &, mlir::Location,
-                                        llvm::StringRef, mlir::FunctionType,
-                                        llvm::ArrayRef<mlir::Value>);
-
 struct MathOperation {
+  // Callback type for generating lowering for a math operation.
+  using MathGeneratorTy = mlir::Value (*)(fir::FirOpBuilder &, mlir::Location,
+                                          const MathOperation &,
+                                          mlir::FunctionType,
+                                          llvm::ArrayRef<mlir::Value>);
+
   // Overrides fir::runtime::FuncTypeBuilderFunc to add FirOpBuilder argument.
   using FuncTypeBuilderFunc = mlir::FunctionType (*)(mlir::MLIRContext *,
                                                      fir::FirOpBuilder &);
@@ -681,25 +682,25 @@ getTypesForArgs(llvm::ArrayRef<mlir::Value> args) {
 }
 
 mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc,
-                       llvm::StringRef libFuncName,
+                       const MathOperation &mathOp,
                        mlir::FunctionType libFuncType,
                        llvm::ArrayRef<mlir::Value> args);
 
 template <typename T>
 mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
-                      llvm::StringRef mathLibFuncName,
+                      const MathOperation &mathOp,
                       mlir::FunctionType mathLibFuncType,
                       llvm::ArrayRef<mlir::Value> args);
 
 template <typename T>
 mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
-                             llvm::StringRef mathLibFuncName,
+                             const MathOperation &mathOp,
                              mlir::FunctionType mathLibFuncType,
                              llvm::ArrayRef<mlir::Value> args);
 
 mlir::Value genLibSplitComplexArgsCall(fir::FirOpBuilder &builder,
                                        mlir::Location loc,
-                                       llvm::StringRef libFuncName,
+                                       const MathOperation &mathOp,
                                        mlir::FunctionType libFuncType,
                                        llvm::ArrayRef<mlir::Value> args);
 
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 446b1529ca0088..151077d81ba14a 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -3247,7 +3247,7 @@ static void createDeclareGlobalOp(mlir::OpBuilder &modBuilder,
                                   fir::FirOpBuilder &builder,
                                   mlir::Location loc, fir::GlobalOp globalOp,
                                   mlir::acc::DataClause clause,
-                                  const std::string declareGlobalName,
+                                  const std::string &declareGlobalName,
                                   bool implicit, std::stringstream &asFortran) {
   GlobalOp declareGlobalOp =
       modBuilder.create<GlobalOp>(loc, declareGlobalName);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index a3536895ca3b7c..3a82be895d37c4 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -657,10 +657,61 @@ static llvm::cl::opt<bool>
                                     "instead of libm complex operations"),
                      llvm::cl::init(false));
 
+/// Return a string containing the given Fortran intrinsic name
+/// with the type of its arguments specified in funcType
+/// surrounded by the given prefix/suffix.
+static std::string
+prettyPrintIntrinsicName(fir::FirOpBuilder &builder, mlir::Location loc,
+                         llvm::StringRef prefix, llvm::StringRef name,
+                         llvm::StringRef suffix, mlir::FunctionType funcType) {
+  std::string output = prefix.str();
+  llvm::raw_string_ostream sstream(output);
+  if (name == "pow") {
+    assert(funcType.getNumInputs() == 2 && "power operator has two arguments");
+    std::string displayName{" ** "};
+    sstream << numericMlirTypeToFortran(builder, funcType.getInput(0), loc,
+                                        displayName)
+            << displayName
+            << numericMlirTypeToFortran(builder, funcType.getInput(1), loc,
+                                        displayName);
+  } else {
+    sstream << name.upper() << "(";
+    if (funcType.getNumInputs() > 0)
+      sstream << numericMlirTypeToFortran(builder, funcType.getInput(0), loc,
+                                          name);
+    for (mlir::Type argType : funcType.getInputs().drop_front()) {
+      sstream << ", " << numericMlirTypeToFortran(builder, argType, loc, name);
+    }
+    sstream << ")";
+  }
+  sstream << suffix;
+  return output;
+}
+
+// Generate a call to the Fortran runtime library providing
+// support for 128-bit float math via a third-party library.
+// If the compiler is built without FLANG_RUNTIME_F128_MATH_LIB,
+// this function will report an error.
+static mlir::Value genLibF128Call(fir::FirOpBuilder &builder,
+                                  mlir::Location loc,
+                                  const MathOperation &mathOp,
+                                  mlir::FunctionType libFuncType,
+                                  llvm::ArrayRef<mlir::Value> args) {
+#ifndef FLANG_RUNTIME_F128_MATH_LIB
+  std::string message = prettyPrintIntrinsicName(
+      builder, loc, "compiler is built without support for '", mathOp.key, "'",
+      libFuncType);
+  fir::emitFatalError(loc, message, /*genCrashDiag=*/false);
+#else  // FLANG_RUNTIME_F128_MATH_LIB
+  return genLibCall(builder, loc, mathOp, libFuncType, args);
+#endif // FLANG_RUNTIME_F128_MATH_LIB
+}
+
 mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc,
-                       llvm::StringRef libFuncName,
+                       const MathOperation &mathOp,
                        mlir::FunctionType libFuncType,
                        llvm::ArrayRef<mlir::Value> args) {
+  llvm::StringRef libFuncName = mathOp.runtimeFunc;
   LLVM_DEBUG(llvm::dbgs() << "Generating '" << libFuncName
                           << "' call with type ";
              libFuncType.dump(); llvm::dbgs() << "\n");
@@ -718,7 +769,7 @@ mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc,
 
 mlir::Value genLibSplitComplexArgsCall(fir::FirOpBuilder &builder,
                                        mlir::Location loc,
-                                       llvm::StringRef libFuncName,
+                                       const MathOperation &mathOp,
                                        mlir::FunctionType libFuncType,
                                        llvm::ArrayRef<mlir::Value> args) {
   assert(args.size() == 2 && "Incorrect #args to genLibSplitComplexArgsCall");
@@ -762,13 +813,12 @@ mlir::Value genLibSplitComplexArgsCall(fir::FirOpBuilder &builder,
       cplx2, /*isImagPart=*/true);
   splitArgs.push_back(imag2);
 
-  return genLibCall(builder, loc, libFuncName, getSplitComplexArgsType(),
-                    splitArgs);
+  return genLibCall(builder, loc, mathOp, getSplitComplexArgsType(), splitArgs);
 }
 
 template <typename T>
 mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
-                      llvm::StringRef mathLibFuncName,
+                      const MathOperation &mathOp,
                       mlir::FunctionType mathLibFuncType,
                       llvm::ArrayRef<mlir::Value> args) {
   // TODO: we have to annotate the math operations with flags
@@ -791,13 +841,14 @@ mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
   //           can be also lowered to libm calls for "fast" and "relaxed"
   //           modes.
   mlir::Value result;
+  llvm::StringRef mathLibFuncName = mathOp.runtimeFunc;
   if (mathRuntimeVersion == preciseVersion &&
       // Some operations do not have to be lowered as conservative
       // calls, since they do not affect strict FP behavior.
       // For example, purely integer operations like exponentiation
       // with integer operands fall into this class.
       !mathLibFuncName.empty()) {
-    result = genLibCall(builder, loc, mathLibFuncName, mathLibFuncType, args);
+    result = genLibCall(builder, loc, mathOp, mathLibFuncType, args);
   } else {
     LLVM_DEBUG(llvm::dbgs() << "Generating '" << mathLibFuncName
                             << "' operation with type ";
@@ -810,7 +861,7 @@ mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
 
 template <typename T>
 mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
-                             llvm::StringRef mathLibFuncName,
+                             const MathOperation &mathOp,
                              mlir::FunctionType mathLibFuncType,
                              llvm::ArrayRef<mlir::Value> args) {
   mlir::Value result;
@@ -819,11 +870,12 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
 
   // If we have libm functions, we can attempt to generate the more precise
   // version of the complex math operation.
+  llvm::StringRef mathLibFuncName = mathOp.runtimeFunc;
   if (!mathLibFuncName.empty()) {
     // If we enabled MLIR complex or can use approximate operations, we should
     // NOT use libm.
     if (!forceMlirComplex && !canUseApprox) {
-      result = genLibCall(builder, loc, mathLibFuncName, mathLibFuncType, args);
+      result = genLibCall(builder, loc, mathOp, mathLibFuncType, args);
       LLVM_DEBUG(result.dump(); llvm::dbgs() << "\n");
       return result;
     }
@@ -863,6 +915,10 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
 /// TODO: support remaining Fortran math intrinsics.
 ///       See https://gcc.gnu.org/onlinedocs/gcc-12.1.0/gfortran/\
 ///       Intrinsic-Procedures.html for a reference.
+constexpr auto FuncTypeReal16Real16 = genFuncType<Ty::Real<16>, Ty::Real<16>>;
+constexpr auto FuncTypeReal16Complex16 =
+    genFuncType<Ty::Real<16>, Ty::Complex<16>>;
+
 static constexpr MathOperation mathOperations[] = {
     {"abs", "fabsf", genFuncType<Ty::Real<4>, Ty::Real<4>>,
      genMathOp<mlir::math::AbsFOp>},
@@ -874,6 +930,7 @@ static constexpr MathOperation mathOperations[] = {
      genComplexMathOp<mlir::complex::AbsOp>},
     {"abs", "cabs", genFuncType<Ty::Real<8>, Ty::Complex<8>>,
      genComplexMathOp<mlir::complex::AbsOp>},
+    {"abs", RTNAME_STRING(CAbsF128), FuncTypeReal16Complex16, genLibF128Call},
     {"acos", "acosf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
     {"acos", "acos", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
     {"acos", "cacosf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>, genLibCall},
@@ -1110,6 +1167,7 @@ static constexpr MathOperation mathOperations[] = {
      genMathOp<mlir::math::SinOp>},
     {"sin", "sin", genFuncType<Ty::Real<8>, Ty::Real<8>>,
      genMathOp<mlir::math::SinOp>},
+    {"sin", RTNAME_STRING(SinF128), FuncTypeReal16Real16, genLibF128Call},
     {"sin", "csinf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
      genComplexMathOp<mlir::complex::SinOp>},
     {"sin", "csin", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
@@ -1122,6 +1180,7 @@ static constexpr MathOperation mathOperations[] = {
      genMathOp<mlir::math::SqrtOp>},
     {"sqrt", "sqrt", genFuncType<Ty::Real<8>, Ty::Real<8>>,
      genMathOp<mlir::math::SqrtOp>},
+    {"sqrt", RTNAME_STRING(SqrtF128), FuncTypeReal16Real16, genLibF128Call},
     {"sqrt", "csqrtf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
      genComplexMathOp<mlir::complex::SqrtOp>},
     {"sqrt", "csqrt", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
@@ -1345,27 +1404,9 @@ static void checkPrecisionLoss(llvm::StringRef name,
   // lowering and could be used here. Emit an error and continue
   // generating the code with the narrowing cast so that the user
   // can get a complete list of the problematic intrinsic calls.
-  std::string message("not yet implemented: no math runtime available for '");
-  llvm::raw_string_ostream sstream(message);
-  if (name == "pow") {
-    assert(funcType.getNumInputs() == 2 && "power operator has two arguments");
-    std::string displayName{" ** "};
-    sstream << numericMlirTypeToFortran(builder, funcType.getInput(0), loc,
-                                        displayName)
-            << displayName
-            << numericMlirTypeToFortran(builder, funcType.getInput(1), loc,
-                                        displayName);
-  } else {
-    sstream << name.upper() << "(";
-    if (funcType.getNumInputs() > 0)
-      sstream << numericMlirTypeToFortran(builder, funcType.getInput(0), loc,
-                                          name);
-    for (mlir::Type argType : funcType.getInputs().drop_front()) {
-      sstream << ", " << numericMlirTypeToFortran(builder, argType, loc, name);
-    }
-    sstream << ")";
-  }
-  sstream << "'";
+  std::string message = prettyPrintIntrinsicName(
+      builder, loc, "not yet implemented: no math runtime available for '",
+      name, "'", funcType);
   mlir::emitError(loc, message);
 }
 
@@ -1887,7 +1928,7 @@ IntrinsicLibrary::getRuntimeCallGenerator(llvm::StringRef name,
     for (auto [fst, snd] : llvm::zip(actualFuncType.getInputs(), args))
       convertedArguments.push_back(builder.createConvert(loc, fst, snd));
     mlir::Value result = mathOp->funcGenerator(
-        builder, loc, mathOp->runtimeFunc, actualFuncType, convertedArguments);
+        builder, loc, *mathOp, actualFuncType, convertedArguments);
     mlir::Type soughtType = soughtFuncType.getResult(0);
     return builder.createConvert(loc, soughtType, result);
   };
diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp
index 19730f7a64337c..7c77bdd79008f1 100644
--- a/flang/lib/Optimizer/CodeGen/Target.cpp
+++ b/flang/lib/Optimizer/CodeGen/Target.cpp
@@ -47,7 +47,7 @@ static const llvm::fltSemantics &floatToSemantics(const KindMapping &kindMap,
 }
 
 static void typeTodo(const llvm::fltSemantics *sem, mlir::Location loc,
-                     std::string context) {
+                     const std::string &context) {
   if (sem == &llvm::APFloat::IEEEhalf()) {
     TODO(loc, "COMPLEX(KIND=2): for " + context + " type");
   } else if (sem == &llvm::APFloat::BFloat()) {
diff --git a/flang/lib/Parser/preprocessor.cpp b/flang/lib/Parser/preprocessor.cpp
index 4c2bd31a2ae841..515b8f62daf9ad 100644
--- a/flang/lib/Parser/preprocessor.cpp
+++ b/flang/lib/Parser/preprocessor.cpp
@@ -252,7 +252,7 @@ void Preprocessor::DefineStandardMacros() {
   Define("__LINE__"s, "__LINE__"s);
 }
 
-void Preprocessor::Define(std::string macro, std::string value) {
+void Preprocessor::Define(const std::string &macro, const std::string &value) {
   definitions_.emplace(SaveTokenAsName(macro), Definition{value, allSources_});
 }
 
diff --git a/flang/lib/Parser/preprocessor.h b/flang/lib/Parser/preprocessor.h
index 3b456364944c3d..b61f1577727beb 100644
--- a/flang/lib/Parser/preprocessor.h
+++ b/flang/lib/Parser/preprocessor.h
@@ -70,7 +70,7 @@ class Preprocessor {
   AllSources &allSources() { return allSources_; }
 
   void DefineStandardMacros();
-  void Define(std::string macro, std::string value);
+  void Define(const std::string &macro, const std::string &value);
   void Undefine(std::string macro);
   bool IsNameDefined(const CharBlock &);
   bool IsFunctionLikeDefinition(const CharBlock &);
diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index f7f22177a7d0bf..e9b23172ed2e28 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -630,9 +630,11 @@ bool Prescanner::NextToken(TokenSequence &tokens) {
     preventHollerith_ = false;
   } else if (IsLegalInIdentifier(*at_)) {
     int parts{1};
+    const char *afterLast{nullptr};
     do {
       EmitChar(tokens, *at_);
       ++at_, ++column_;
+      afterLast = at_;
       if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) {
         tokens.CloseToken();
         ++parts;
@@ -640,12 +642,20 @@ bool Prescanner::NextToken(TokenSequence &tokens) {
     } while (IsLegalInIdentifier(*at_));
     if (parts >= 3) {
       // Subtlety: When an identifier is split across three or more continuation
-      // lines, its parts are kept as distinct pp-tokens so that macro
-      // operates on them independently.  This trick accommodates the historic
-      // practice of using line continuation for token pasting after
-      // replacement.
+      // lines (or two continuation lines, immediately preceded or followed
+      // by '&' free form continuation line markers, its parts are kept as
+      // distinct pp-tokens so that macro operates on them independently.
+      // This trick accommodates the historic practice of using line
+      // continuation for token pasting after replacement.
     } else if (parts == 2) {
-      tokens.ReopenLastToken();
+      if ((start > start_ && start[-1] == '&') ||
+          (afterLast < limit_ && (*afterLast == '&' || *afterLast == '\n'))) {
+        // call &                call foo&        call foo&
+        //   &MACRO&      OR       &MACRO&   OR     &MACRO
+        //   &foo(...)             &(...)
+      } else {
+        tokens.ReopenLastToken();
+      }
     }
     if (InFixedFormSource()) {
       SkipSpaces();
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index 816227fb3354ff..2db3f9a27d8f4a 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -192,7 +192,7 @@ class DistinguishabilityHelper {
 
 private:
   void SayNotDistinguishable(const Scope &, const SourceName &, GenericKind,
-      const Symbol &, const Symbol &, bool isError);
+      const Symbol &, const Symbol &, bool isHardConflict);
   void AttachDeclaration(parser::Message &, const Scope &, const Symbol &);
 
   SemanticsContext &context_;
@@ -3513,6 +3513,11 @@ void DistinguishabilityHelper::Add(const Symbol &generic, GenericKind kind,
 }
 
 void DistinguishabilityHelper::Check(const Scope &scope) {
+  if (FindModuleFileContaining(scope)) {
+    // Distinguishability was checked when the module was created;
+    // don't let optional warnings then become errors now.
+    return;
+  }
   for (const auto &[name, info] : nameToSpecifics_) {
     for (auto iter1{info.begin()}; iter1 != info.end(); ++iter1) {
       const auto &[ultimate, procInfo]{*iter1};
@@ -3534,15 +3539,21 @@ void DistinguishabilityHelper::Check(const Scope &scope) {
 
 void DistinguishabilityHelper::SayNotDistinguishable(const Scope &scope,
     const SourceName &name, GenericKind kind, const Symbol &proc1,
-    const Symbol &proc2, bool isError) {
-  if (!isError &&
-      !context_.ShouldWarn(
-          common::LanguageFeature::IndistinguishableSpecifics)) {
-    // The rules for distinguishing specific procedures (F'2023 15.4.3.4.5)
-    // are inadequate for some real-world cases like pFUnit.
-    // When there are optional dummy arguments or unlimited polymorphic
-    // dummy data object arguments, the best that we can do is emit an optional
-    // portability warning.
+    const Symbol &proc2, bool isHardConflict) {
+  bool isUseAssociated{!scope.sourceRange().Contains(name)};
+  // The rules for distinguishing specific procedures (F'2023 15.4.3.4.5)
+  // are inadequate for some real-world cases like pFUnit.
+  // When there are optional dummy arguments or unlimited polymorphic
+  // dummy data object arguments, the best that we can do is emit an optional
+  // portability warning.  Also, named generics created by USE association
+  // merging shouldn't receive hard errors for ambiguity.
+  // (Non-named generics might be defined I/O procedures or defined
+  // assignments that need to be used by the runtime.)
+  bool isWarning{!isHardConflict || (isUseAssociated && kind.IsName())};
+  if (isWarning &&
+      (!context_.ShouldWarn(
+           common::LanguageFeature::IndistinguishableSpecifics) ||
+          FindModuleFileContaining(scope))) {
     return;
   }
   std::string name1{proc1.name().ToString()};
@@ -3557,17 +3568,20 @@ void DistinguishabilityHelper::SayNotDistinguishable(const Scope &scope,
     }
   }
   parser::Message *msg;
-  if (scope.sourceRange().Contains(name)) {
+  if (!isUseAssociated) {
+    CHECK(isWarning == !isHardConflict);
     msg = &context_.Say(name,
-        isError
+        isHardConflict
             ? "Generic '%s' may not have specific procedures '%s' and '%s' as their interfaces are not distinguishable"_err_en_US
             : "Generic '%s' should not have specific procedures '%s' and '%s' as their interfaces are not distinguishable by the rules in the standard"_port_en_US,
         MakeOpName(name), name1, name2);
   } else {
     msg = &context_.Say(*GetTopLevelUnitContaining(proc1).GetName(),
-        isError
-            ? "USE-associated generic '%s' may not have specific procedures '%s' and '%s' as their interfaces are not distinguishable"_err_en_US
-            : "USE-associated generic '%s' should not have specific procedures '%s' and '%s' as their interfaces are not distinguishable by the incomplete rules in the standard"_port_en_US,
+        isHardConflict
+            ? (isWarning
+                      ? "USE-associated generic '%s' should not have specific procedures '%s' and '%s' as their interfaces are not distinguishable"_warn_en_US
+                      : "USE-associated generic '%s' may not have specific procedures '%s' and '%s' as their interfaces are not distinguishable"_err_en_US)
+            : "USE-associated generic '%s' should not have specific procedures '%s' and '%s' as their interfaces are not distinguishable by the rules in the standard"_port_en_US,
         MakeOpName(name), name1, name2);
   }
   AttachDeclaration(*msg, scope, proc1);
diff --git a/flang/lib/Semantics/check-directive-structure.h b/flang/lib/Semantics/check-directive-structure.h
index 829405f99d64c0..97e13c59ac4167 100644
--- a/flang/lib/Semantics/check-directive-structure.h
+++ b/flang/lib/Semantics/check-directive-structure.h
@@ -176,8 +176,8 @@ template <typename D, typename C, typename PC, std::size_t ClauseEnumSize>
 class DirectiveStructureChecker : public virtual BaseChecker {
 protected:
   DirectiveStructureChecker(SemanticsContext &context,
-      std::unordered_map<D, DirectiveClauses<C, ClauseEnumSize>>
-          directiveClausesMap)
+      const std::unordered_map<D, DirectiveClauses<C, ClauseEnumSize>>
+          &directiveClausesMap)
       : context_{context}, directiveClausesMap_(directiveClausesMap) {}
   virtual ~DirectiveStructureChecker() {}
 
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index dfa9da502db0a8..ac89184a7cbffc 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -46,6 +46,23 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   endif ()
   include_directories(BEFORE
     ${FLANG_SOURCE_DIR}/include)
+
+  # The out of tree builds of the compiler and the Fortran runtime
+  # must use the same setting of FLANG_RUNTIME_F128_MATH_LIB
+  # to be composable. Failure to synchronize this setting may result
+  # in linking errors or fatal failures in F128 runtime functions.
+  set(FLANG_RUNTIME_F128_MATH_LIB "" CACHE STRING
+    "Specifies the target library used for implementing IEEE-754 128-bit float \
+    math in F18 runtime, e.g. it might be libquadmath for targets where \
+    REAL(16) is mapped to __float128, or libm for targets where REAL(16) \
+    is mapped to long double, etc."
+    )
+
+  if (NOT FLANG_RUNTIME_F128_MATH_LIB STREQUAL "")
+    add_compile_definitions(
+      -DFLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}"
+      )
+  endif()
 endif()
 
 include(CheckCXXSymbolExists)
@@ -83,6 +100,9 @@ add_definitions(-U_GLIBCXX_ASSERTIONS)
 add_definitions(-U_LIBCPP_ENABLE_ASSERTIONS)
 
 add_subdirectory(FortranMain)
+if (NOT ${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "")
+  add_subdirectory(Float128Math)
+endif()
 
 set(sources
   ISO_Fortran_binding.cpp
diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt
new file mode 100644
index 00000000000000..f8da4d7ca1a9fe
--- /dev/null
+++ b/flang/runtime/Float128Math/CMakeLists.txt
@@ -0,0 +1,56 @@
+#===-- runtime/Float128Math/CMakeLists.txt ---------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+# FortranFloat128 implements IEEE-754 128-bit float math functions.
+# It is a thin wapper and it currently relies on third-party
+# libraries available for the target.
+# It is distributed as a static library only.
+# Fortran programs/libraries that end up linking any of the provided
+# will have a dependency on the third-party library that is being
+# used for building this FortranFloat128Math library.
+
+if (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libquadmath" OR
+    ${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "quadmath")
+  check_include_file(quadmath.h FOUND_QUADMATH_HEADER)
+  if(FOUND_QUADMATH_HEADER)
+    add_compile_definitions(HAS_QUADMATHLIB)
+  else()
+    message(FATAL_ERROR
+      "FLANG_RUNTIME_F128_MATH_LIB setting requires quadmath.h "
+      "to be available: ${FLANG_RUNTIME_F128_MATH_LIB}"
+      )
+  endif()
+else()
+  message(FATAL_ERROR
+    "Unsupported third-party library for Fortran F128 math runtime: "
+    "${FLANG_RUNTIME_F128_MATH_LIB}"
+    )
+endif()
+
+set(sources
+  cabs.cpp
+  sin.cpp
+  sqrt.cpp
+  )
+
+include_directories(AFTER "${CMAKE_CURRENT_SOURCE_DIR}/..")
+add_flang_library(FortranFloat128Math STATIC INSTALL_WITH_TOOLCHAIN ${sources})
+
+if (DEFINED MSVC)
+  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
+  add_flang_library(FortranFloat128Math.static STATIC INSTALL_WITH_TOOLCHAIN
+    ${sources}
+    )
+  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebug)
+  add_flang_library(FortranFloat128Math.static_dbg STATIC INSTALL_WITH_TOOLCHAIN
+    ${sources}
+    )
+  add_dependencies(FortranFloat128Math FortranFloat128Math.static
+    FortranFloat128Math.static_dbg
+    )
+endif()
diff --git a/flang/runtime/Float128Math/cabs.cpp b/flang/runtime/Float128Math/cabs.cpp
new file mode 100644
index 00000000000000..63f2bdf8e177ae
--- /dev/null
+++ b/flang/runtime/Float128Math/cabs.cpp
@@ -0,0 +1,24 @@
+//===-- runtime/Float128Math/cabs.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+// FIXME: the argument should be CppTypeFor<TypeCategory::Complex, 16>,
+// and it should be translated into the underlying library's
+// corresponding complex128 type.
+CppTypeFor<TypeCategory::Real, 16> RTDEF(CAbsF128)(ComplexF128 x) {
+  return CAbs<RTNAME(CAbsF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h
new file mode 100644
index 00000000000000..91c14b008b5768
--- /dev/null
+++ b/flang/runtime/Float128Math/math-entries.h
@@ -0,0 +1,77 @@
+//===-- runtime/Float128Math/math-entries.h ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_
+#define FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_
+#include "terminator.h"
+#include "tools.h"
+#include "flang/Common/float128.h"
+#include "flang/Runtime/entry-names.h"
+#include <type_traits>
+
+namespace Fortran::runtime {
+
+// Define a class template to gracefully fail, when
+// there is no specialized template that implements
+// the required function via using the third-party
+// implementation.
+#define DEFINE_FALLBACK(caller) \
+  template <auto F> struct caller { \
+    template <typename... ATs> \
+    [[noreturn]] static std::invoke_result_t<decltype(F), ATs...> invoke( \
+        ATs... args) { \
+      Terminator terminator{__FILE__, __LINE__}; \
+      terminator.Crash("Float128 variant of '%s' is unsupported", #caller); \
+    } \
+  };
+
+// Define template specialization that is calling the third-party
+// implementation. The template is specialized by a function pointer
+// that is the FortranFloat128Math entry point. The signatures
+// of the caller and the callee must match.
+//
+// Defining the specialization for any target library requires
+// adding the generic template via DEFINE_FALLBACK, so that
+// a build with another target library that does not define
+// the same alias can gracefully fail in runtime.
+#define DEFINE_SIMPLE_ALIAS(caller, callee) \
+  template <typename RT, typename... ATs, RT (*p)(ATs...)> struct caller<p> { \
+    static RT invoke(ATs... args) { \
+      static_assert(std::is_invocable_r_v<RT, decltype(callee), ATs...>); \
+      if constexpr (std::is_same_v<RT, void>) { \
+        callee(args...); \
+      } else { \
+        return callee(args...); \
+      } \
+    } \
+  };
+
+// Define fallback callers.
+DEFINE_FALLBACK(CAbs)
+DEFINE_FALLBACK(Sin)
+DEFINE_FALLBACK(Sqrt)
+
+// Define ComplexF128 type that is compatible with
+// the type of results/arguments of libquadmath.
+// TODO: this may need more work for other libraries/compilers.
+#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__)
+typedef _Complex float __attribute__((mode(TC))) ComplexF128;
+#else
+typedef _Complex float __attribute__((mode(KC))) ComplexF128;
+#endif
+
+#if HAS_QUADMATHLIB
+// Define wrapper callers for libquadmath.
+#include "quadmath.h"
+DEFINE_SIMPLE_ALIAS(CAbs, cabsq)
+DEFINE_SIMPLE_ALIAS(Sin, sinq)
+DEFINE_SIMPLE_ALIAS(Sqrt, sqrtq)
+#endif
+} // namespace Fortran::runtime
+
+#endif // FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_
diff --git a/flang/runtime/Float128Math/sin.cpp b/flang/runtime/Float128Math/sin.cpp
new file mode 100644
index 00000000000000..013eb9d119a6a3
--- /dev/null
+++ b/flang/runtime/Float128Math/sin.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/sin.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(SinF128)(
+    CppTypeFor<TypeCategory::Real, 16> x) {
+  return Sin<RTNAME(SinF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/sqrt.cpp b/flang/runtime/Float128Math/sqrt.cpp
new file mode 100644
index 00000000000000..aafbd850ca973a
--- /dev/null
+++ b/flang/runtime/Float128Math/sqrt.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/sqrt.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(SqrtF128)(
+    CppTypeFor<TypeCategory::Real, 16> x) {
+  return Sqrt<RTNAME(SqrtF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 18590567c65eb0..58ca313d9e4454 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -679,6 +679,7 @@ void ExternalFileUnit::Rewind(IoErrorHandler &handler) {
     handler.SignalError(IostatRewindNonSequential,
         "REWIND(UNIT=%d) on non-sequential file", unitNumber());
   } else {
+    DoImpliedEndfile(handler);
     SetPosition(0, handler);
     currentRecordNumber = 1;
     leftTabLimit.reset();
@@ -687,7 +688,6 @@ void ExternalFileUnit::Rewind(IoErrorHandler &handler) {
 }
 
 void ExternalFileUnit::SetPosition(std::int64_t pos, IoErrorHandler &handler) {
-  DoImpliedEndfile(handler);
   frameOffsetInFile_ = pos;
   recordOffsetInFrame_ = 0;
   if (access == Access::Direct) {
@@ -707,6 +707,12 @@ bool ExternalFileUnit::SetStreamPos(
         "POS=%zd is invalid", static_cast<std::intmax_t>(oneBasedPos));
     return false;
   }
+  // A backwards POS= implies truncation after writing, at least in
+  // Intel and NAG.
+  if (static_cast<std::size_t>(oneBasedPos - 1) <
+      frameOffsetInFile_ + recordOffsetInFrame_) {
+    DoImpliedEndfile(handler);
+  }
   SetPosition(oneBasedPos - 1, handler);
   // We no longer know which record we're in.  Set currentRecordNumber to
   // a large value from whence we can both advance and backspace.
diff --git a/flang/test/Lower/Intrinsics/missing-math-runtime.f90 b/flang/test/Lower/Intrinsics/missing-math-runtime.f90
index 98d3abb17f3a8f..ff767ba18faaec 100644
--- a/flang/test/Lower/Intrinsics/missing-math-runtime.f90
+++ b/flang/test/Lower/Intrinsics/missing-math-runtime.f90
@@ -1,10 +1,14 @@
 ! There is no quad math runtime available in lowering
 ! for now. Test that the TODO are emitted correctly.
+! FIXME: the lit config has to flip a feature flag so that
+! the tests can use different checks depending on whether
+! REAL(16) math support is enabled or not.
+! XFAIL: *
 ! RUN: bbc -emit-fir %s -o /dev/null 2>&1 | FileCheck %s
 
  complex(16) :: a
  real(16) :: b
-! CHECK: not yet implemented: no math runtime available for 'ABS(COMPLEX(KIND=16))'
+! CHECK: compiler is built without support for 'ABS(COMPLEX(KIND=16))'
  b = abs(a)
 end
 
diff --git a/flang/test/Preprocessing/pp005.F b/flang/test/Preprocessing/pp005.F
index e4483b404c3673..a8d7394cb12d38 100644
--- a/flang/test/Preprocessing/pp005.F
+++ b/flang/test/Preprocessing/pp005.F
@@ -1,11 +1,11 @@
 ! RUN: %flang -E %s 2>&1 | FileCheck %s
-! CHECK: res = 777
+! CHECK: res = (777)
 * KWM split across continuation, implicit padding
       integer, parameter :: KWM = 666
 #define KWM 777
       integer :: res
-      res = KW
-     +M
+      res = (KW
+     +M)
       if (res .eq. 777) then
         print *, 'pp005.F yes'
       else
diff --git a/flang/test/Preprocessing/pp006.F b/flang/test/Preprocessing/pp006.F
index f526ad31733efa..e45dcf9c18e196 100644
--- a/flang/test/Preprocessing/pp006.F
+++ b/flang/test/Preprocessing/pp006.F
@@ -1,12 +1,12 @@
 ! RUN: %flang -E %s 2>&1 | FileCheck %s
-! CHECK: res = 777
+! CHECK: res = (777)
 * ditto, but with intervening *comment line
       integer, parameter :: KWM = 666
 #define KWM 777
       integer :: res
-      res = KW
+      res = (KW
 *comment
-     +M
+     +M)
       if (res .eq. 777) then
         print *, 'pp006.F yes'
       else
diff --git a/flang/test/Preprocessing/pp105.F90 b/flang/test/Preprocessing/pp105.F90
index b4f73da6fa24c0..e861e9688d2c54 100644
--- a/flang/test/Preprocessing/pp105.F90
+++ b/flang/test/Preprocessing/pp105.F90
@@ -1,11 +1,11 @@
 ! RUN: %flang -E %s 2>&1 | FileCheck %s
-! CHECK: res = 777
+! CHECK: res = (777)
 ! KWM call name split across continuation, with leading &
       integer, parameter :: KWM = 666
 #define KWM 777
       integer :: res
-      res = KW&
-&M
+      res = (KW&
+&M)
       if (res .eq. 777) then
         print *, 'pp105.F90 yes'
       else
diff --git a/flang/test/Preprocessing/pp106.F90 b/flang/test/Preprocessing/pp106.F90
index 556d779048f6c9..a450807f0bd214 100644
--- a/flang/test/Preprocessing/pp106.F90
+++ b/flang/test/Preprocessing/pp106.F90
@@ -1,11 +1,11 @@
 ! RUN: %flang -E %s 2>&1 | FileCheck %s
-! CHECK: res = 777
+! CHECK: res = (777)
 ! ditto, with & ! comment
       integer, parameter :: KWM = 666
 #define KWM 777
       integer :: res
-      res = KW& ! comment
-&M
+      res = (KW& ! comment
+&M)
       if (res .eq. 777) then
         print *, 'pp106.F90 yes'
       else
diff --git a/flang/test/Preprocessing/pp134.F90 b/flang/test/Preprocessing/pp134.F90
index 01e7b010d426ec..bc34767224fa03 100644
--- a/flang/test/Preprocessing/pp134.F90
+++ b/flang/test/Preprocessing/pp134.F90
@@ -1,9 +1,23 @@
 ! RUN: %flang -E %s 2>&1 | FileCheck %s
-! CHECK: print *, ADC
+! CHECK: print *, ADC, 1
+! CHECK: print *, AD, 1
+! CHECK: print *, DC, 1
+! CHECK: print *, AD
+! CHECK: print *, AB
 #define B D
 implicit none
 real ADC
 print *, A&
   &B&
-  &C
+  &C, 1
+print *, A&
+  &B&
+  &, 1
+print *, &
+  &B&
+  &C, 1
+print *, A&
+  &B
+print *, A&
+  &B ! but not this
 end
diff --git a/flang/test/Semantics/resolve17.f90 b/flang/test/Semantics/resolve17.f90
index 513676fe670a14..770af756d03bc3 100644
--- a/flang/test/Semantics/resolve17.f90
+++ b/flang/test/Semantics/resolve17.f90
@@ -180,7 +180,7 @@ subroutine g()
   end
 end module
 subroutine s9
-  !ERROR: USE-associated generic 'g' may not have specific procedures 'g' and 'g' as their interfaces are not distinguishable
+  !PORTABILITY: USE-associated generic 'g' should not have specific procedures 'g' and 'g' as their interfaces are not distinguishable
   use m9a
   use m9b
 end
diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp
index c4e56a862c8613..52136df10c0b02 100644
--- a/flang/tools/flang-driver/driver.cpp
+++ b/flang/tools/flang-driver/driver.cpp
@@ -130,6 +130,9 @@ int main(int argc, const char **argv) {
                                   llvm::sys::getDefaultTargetTriple(), diags,
                                   "flang LLVM compiler");
   theDriver.setTargetAndMode(targetandMode);
+#ifdef FLANG_RUNTIME_F128_MATH_LIB
+  theDriver.setFlangF128MathLibrary(FLANG_RUNTIME_F128_MATH_LIB);
+#endif
   std::unique_ptr<clang::driver::Compilation> c(
       theDriver.BuildCompilation(args));
   llvm::SmallVector<std::pair<int, const clang::driver::Command *>, 4>
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index f725b1c2394c6a..608ac46034306b 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -73,6 +73,49 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.vsprintf
     libc.src.stdio.vsnprintf
 
+
+    # stdbit.h entrypoints
+    libc.src.stdbit.stdc_leading_zeros_uc
+    libc.src.stdbit.stdc_leading_zeros_us
+    libc.src.stdbit.stdc_leading_zeros_ui
+    libc.src.stdbit.stdc_leading_zeros_ul
+    libc.src.stdbit.stdc_leading_zeros_ull
+    libc.src.stdbit.stdc_leading_ones_uc
+    libc.src.stdbit.stdc_leading_ones_us
+    libc.src.stdbit.stdc_leading_ones_ui
+    libc.src.stdbit.stdc_leading_ones_ul
+    libc.src.stdbit.stdc_leading_ones_ull
+    libc.src.stdbit.stdc_trailing_zeros_uc
+    libc.src.stdbit.stdc_trailing_zeros_us
+    libc.src.stdbit.stdc_trailing_zeros_ui
+    libc.src.stdbit.stdc_trailing_zeros_ul
+    libc.src.stdbit.stdc_trailing_zeros_ull
+    libc.src.stdbit.stdc_trailing_ones_uc
+    libc.src.stdbit.stdc_trailing_ones_us
+    libc.src.stdbit.stdc_trailing_ones_ui
+    libc.src.stdbit.stdc_trailing_ones_ul
+    libc.src.stdbit.stdc_trailing_ones_ull
+    libc.src.stdbit.stdc_first_leading_zero_uc
+    libc.src.stdbit.stdc_first_leading_zero_us
+    libc.src.stdbit.stdc_first_leading_zero_ui
+    libc.src.stdbit.stdc_first_leading_zero_ul
+    libc.src.stdbit.stdc_first_leading_zero_ull
+    libc.src.stdbit.stdc_first_leading_one_uc
+    libc.src.stdbit.stdc_first_leading_one_us
+    libc.src.stdbit.stdc_first_leading_one_ui
+    libc.src.stdbit.stdc_first_leading_one_ul
+    libc.src.stdbit.stdc_first_leading_one_ull
+    libc.src.stdbit.stdc_first_trailing_zero_uc
+    libc.src.stdbit.stdc_first_trailing_zero_us
+    libc.src.stdbit.stdc_first_trailing_zero_ui
+    libc.src.stdbit.stdc_first_trailing_zero_ul
+    libc.src.stdbit.stdc_first_trailing_zero_ull
+    libc.src.stdbit.stdc_first_trailing_one_uc
+    libc.src.stdbit.stdc_first_trailing_one_us
+    libc.src.stdbit.stdc_first_trailing_one_ui
+    libc.src.stdbit.stdc_first_trailing_one_ul
+    libc.src.stdbit.stdc_first_trailing_one_ull
+
     # stdlib.h entrypoints
     libc.src.stdlib.abort
     libc.src.stdlib.abs
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index f725b1c2394c6a..2f299e992be09a 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -73,6 +73,48 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.vsprintf
     libc.src.stdio.vsnprintf
 
+    # stdbit.h entrypoints
+    libc.src.stdbit.stdc_leading_zeros_uc
+    libc.src.stdbit.stdc_leading_zeros_us
+    libc.src.stdbit.stdc_leading_zeros_ui
+    libc.src.stdbit.stdc_leading_zeros_ul
+    libc.src.stdbit.stdc_leading_zeros_ull
+    libc.src.stdbit.stdc_leading_ones_uc
+    libc.src.stdbit.stdc_leading_ones_us
+    libc.src.stdbit.stdc_leading_ones_ui
+    libc.src.stdbit.stdc_leading_ones_ul
+    libc.src.stdbit.stdc_leading_ones_ull
+    libc.src.stdbit.stdc_trailing_zeros_uc
+    libc.src.stdbit.stdc_trailing_zeros_us
+    libc.src.stdbit.stdc_trailing_zeros_ui
+    libc.src.stdbit.stdc_trailing_zeros_ul
+    libc.src.stdbit.stdc_trailing_zeros_ull
+    libc.src.stdbit.stdc_trailing_ones_uc
+    libc.src.stdbit.stdc_trailing_ones_us
+    libc.src.stdbit.stdc_trailing_ones_ui
+    libc.src.stdbit.stdc_trailing_ones_ul
+    libc.src.stdbit.stdc_trailing_ones_ull
+    libc.src.stdbit.stdc_first_leading_zero_uc
+    libc.src.stdbit.stdc_first_leading_zero_us
+    libc.src.stdbit.stdc_first_leading_zero_ui
+    libc.src.stdbit.stdc_first_leading_zero_ul
+    libc.src.stdbit.stdc_first_leading_zero_ull
+    libc.src.stdbit.stdc_first_leading_one_uc
+    libc.src.stdbit.stdc_first_leading_one_us
+    libc.src.stdbit.stdc_first_leading_one_ui
+    libc.src.stdbit.stdc_first_leading_one_ul
+    libc.src.stdbit.stdc_first_leading_one_ull
+    libc.src.stdbit.stdc_first_trailing_zero_uc
+    libc.src.stdbit.stdc_first_trailing_zero_us
+    libc.src.stdbit.stdc_first_trailing_zero_ui
+    libc.src.stdbit.stdc_first_trailing_zero_ul
+    libc.src.stdbit.stdc_first_trailing_zero_ull
+    libc.src.stdbit.stdc_first_trailing_one_uc
+    libc.src.stdbit.stdc_first_trailing_one_us
+    libc.src.stdbit.stdc_first_trailing_one_ui
+    libc.src.stdbit.stdc_first_trailing_one_ul
+    libc.src.stdbit.stdc_first_trailing_one_ull
+
     # stdlib.h entrypoints
     libc.src.stdlib.abort
     libc.src.stdlib.abs
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index b333c6be144627..5224e92bbcc589 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -65,6 +65,48 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.string.strtok_r
     libc.src.string.strxfrm
 
+    # stdbit.h entrypoints
+    libc.src.stdbit.stdc_leading_zeros_uc
+    libc.src.stdbit.stdc_leading_zeros_us
+    libc.src.stdbit.stdc_leading_zeros_ui
+    libc.src.stdbit.stdc_leading_zeros_ul
+    libc.src.stdbit.stdc_leading_zeros_ull
+    libc.src.stdbit.stdc_leading_ones_uc
+    libc.src.stdbit.stdc_leading_ones_us
+    libc.src.stdbit.stdc_leading_ones_ui
+    libc.src.stdbit.stdc_leading_ones_ul
+    libc.src.stdbit.stdc_leading_ones_ull
+    libc.src.stdbit.stdc_trailing_zeros_uc
+    libc.src.stdbit.stdc_trailing_zeros_us
+    libc.src.stdbit.stdc_trailing_zeros_ui
+    libc.src.stdbit.stdc_trailing_zeros_ul
+    libc.src.stdbit.stdc_trailing_zeros_ull
+    libc.src.stdbit.stdc_trailing_ones_uc
+    libc.src.stdbit.stdc_trailing_ones_us
+    libc.src.stdbit.stdc_trailing_ones_ui
+    libc.src.stdbit.stdc_trailing_ones_ul
+    libc.src.stdbit.stdc_trailing_ones_ull
+    libc.src.stdbit.stdc_first_leading_zero_uc
+    libc.src.stdbit.stdc_first_leading_zero_us
+    libc.src.stdbit.stdc_first_leading_zero_ui
+    libc.src.stdbit.stdc_first_leading_zero_ul
+    libc.src.stdbit.stdc_first_leading_zero_ull
+    libc.src.stdbit.stdc_first_leading_one_uc
+    libc.src.stdbit.stdc_first_leading_one_us
+    libc.src.stdbit.stdc_first_leading_one_ui
+    libc.src.stdbit.stdc_first_leading_one_ul
+    libc.src.stdbit.stdc_first_leading_one_ull
+    libc.src.stdbit.stdc_first_trailing_zero_uc
+    libc.src.stdbit.stdc_first_trailing_zero_us
+    libc.src.stdbit.stdc_first_trailing_zero_ui
+    libc.src.stdbit.stdc_first_trailing_zero_ul
+    libc.src.stdbit.stdc_first_trailing_zero_ull
+    libc.src.stdbit.stdc_first_trailing_one_uc
+    libc.src.stdbit.stdc_first_trailing_one_us
+    libc.src.stdbit.stdc_first_trailing_one_ui
+    libc.src.stdbit.stdc_first_trailing_one_ul
+    libc.src.stdbit.stdc_first_trailing_one_ull
+
     # stdlib.h entrypoints
     libc.src.stdlib.abs
     libc.src.stdlib.atoi
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 6e194682df4bfc..8a6c160c099322 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -95,6 +95,41 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdbit.stdc_leading_zeros_ui
     libc.src.stdbit.stdc_leading_zeros_ul
     libc.src.stdbit.stdc_leading_zeros_ull
+    libc.src.stdbit.stdc_leading_ones_uc
+    libc.src.stdbit.stdc_leading_ones_us
+    libc.src.stdbit.stdc_leading_ones_ui
+    libc.src.stdbit.stdc_leading_ones_ul
+    libc.src.stdbit.stdc_leading_ones_ull
+    libc.src.stdbit.stdc_trailing_zeros_uc
+    libc.src.stdbit.stdc_trailing_zeros_us
+    libc.src.stdbit.stdc_trailing_zeros_ui
+    libc.src.stdbit.stdc_trailing_zeros_ul
+    libc.src.stdbit.stdc_trailing_zeros_ull
+    libc.src.stdbit.stdc_trailing_ones_uc
+    libc.src.stdbit.stdc_trailing_ones_us
+    libc.src.stdbit.stdc_trailing_ones_ui
+    libc.src.stdbit.stdc_trailing_ones_ul
+    libc.src.stdbit.stdc_trailing_ones_ull
+    libc.src.stdbit.stdc_first_leading_zero_uc
+    libc.src.stdbit.stdc_first_leading_zero_us
+    libc.src.stdbit.stdc_first_leading_zero_ui
+    libc.src.stdbit.stdc_first_leading_zero_ul
+    libc.src.stdbit.stdc_first_leading_zero_ull
+    libc.src.stdbit.stdc_first_leading_one_uc
+    libc.src.stdbit.stdc_first_leading_one_us
+    libc.src.stdbit.stdc_first_leading_one_ui
+    libc.src.stdbit.stdc_first_leading_one_ul
+    libc.src.stdbit.stdc_first_leading_one_ull
+    libc.src.stdbit.stdc_first_trailing_zero_uc
+    libc.src.stdbit.stdc_first_trailing_zero_us
+    libc.src.stdbit.stdc_first_trailing_zero_ui
+    libc.src.stdbit.stdc_first_trailing_zero_ul
+    libc.src.stdbit.stdc_first_trailing_zero_ull
+    libc.src.stdbit.stdc_first_trailing_one_uc
+    libc.src.stdbit.stdc_first_trailing_one_us
+    libc.src.stdbit.stdc_first_trailing_one_ui
+    libc.src.stdbit.stdc_first_trailing_one_ul
+    libc.src.stdbit.stdc_first_trailing_one_ull
 
     # stdlib.h entrypoints
     libc.src.stdlib.abs
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 9bacfab7b0e5ae..7df19049088867 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -73,6 +73,41 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdbit.stdc_leading_zeros_ui
     libc.src.stdbit.stdc_leading_zeros_ul
     libc.src.stdbit.stdc_leading_zeros_ull
+    libc.src.stdbit.stdc_leading_ones_uc
+    libc.src.stdbit.stdc_leading_ones_us
+    libc.src.stdbit.stdc_leading_ones_ui
+    libc.src.stdbit.stdc_leading_ones_ul
+    libc.src.stdbit.stdc_leading_ones_ull
+    libc.src.stdbit.stdc_trailing_zeros_uc
+    libc.src.stdbit.stdc_trailing_zeros_us
+    libc.src.stdbit.stdc_trailing_zeros_ui
+    libc.src.stdbit.stdc_trailing_zeros_ul
+    libc.src.stdbit.stdc_trailing_zeros_ull
+    libc.src.stdbit.stdc_trailing_ones_uc
+    libc.src.stdbit.stdc_trailing_ones_us
+    libc.src.stdbit.stdc_trailing_ones_ui
+    libc.src.stdbit.stdc_trailing_ones_ul
+    libc.src.stdbit.stdc_trailing_ones_ull
+    libc.src.stdbit.stdc_first_leading_zero_uc
+    libc.src.stdbit.stdc_first_leading_zero_us
+    libc.src.stdbit.stdc_first_leading_zero_ui
+    libc.src.stdbit.stdc_first_leading_zero_ul
+    libc.src.stdbit.stdc_first_leading_zero_ull
+    libc.src.stdbit.stdc_first_leading_one_uc
+    libc.src.stdbit.stdc_first_leading_one_us
+    libc.src.stdbit.stdc_first_leading_one_ui
+    libc.src.stdbit.stdc_first_leading_one_ul
+    libc.src.stdbit.stdc_first_leading_one_ull
+    libc.src.stdbit.stdc_first_trailing_zero_uc
+    libc.src.stdbit.stdc_first_trailing_zero_us
+    libc.src.stdbit.stdc_first_trailing_zero_ui
+    libc.src.stdbit.stdc_first_trailing_zero_ul
+    libc.src.stdbit.stdc_first_trailing_zero_ull
+    libc.src.stdbit.stdc_first_trailing_one_uc
+    libc.src.stdbit.stdc_first_trailing_one_us
+    libc.src.stdbit.stdc_first_trailing_one_ui
+    libc.src.stdbit.stdc_first_trailing_one_ul
+    libc.src.stdbit.stdc_first_trailing_one_ull
 
     # stdlib.h entrypoints
     libc.src.stdlib.abs
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 71ff4bcfc35195..5c8cc7618a9e8c 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -97,6 +97,41 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdbit.stdc_leading_zeros_ui
     libc.src.stdbit.stdc_leading_zeros_ul
     libc.src.stdbit.stdc_leading_zeros_ull
+    libc.src.stdbit.stdc_leading_ones_uc
+    libc.src.stdbit.stdc_leading_ones_us
+    libc.src.stdbit.stdc_leading_ones_ui
+    libc.src.stdbit.stdc_leading_ones_ul
+    libc.src.stdbit.stdc_leading_ones_ull
+    libc.src.stdbit.stdc_trailing_zeros_uc
+    libc.src.stdbit.stdc_trailing_zeros_us
+    libc.src.stdbit.stdc_trailing_zeros_ui
+    libc.src.stdbit.stdc_trailing_zeros_ul
+    libc.src.stdbit.stdc_trailing_zeros_ull
+    libc.src.stdbit.stdc_trailing_ones_uc
+    libc.src.stdbit.stdc_trailing_ones_us
+    libc.src.stdbit.stdc_trailing_ones_ui
+    libc.src.stdbit.stdc_trailing_ones_ul
+    libc.src.stdbit.stdc_trailing_ones_ull
+    libc.src.stdbit.stdc_first_leading_zero_uc
+    libc.src.stdbit.stdc_first_leading_zero_us
+    libc.src.stdbit.stdc_first_leading_zero_ui
+    libc.src.stdbit.stdc_first_leading_zero_ul
+    libc.src.stdbit.stdc_first_leading_zero_ull
+    libc.src.stdbit.stdc_first_leading_one_uc
+    libc.src.stdbit.stdc_first_leading_one_us
+    libc.src.stdbit.stdc_first_leading_one_ui
+    libc.src.stdbit.stdc_first_leading_one_ul
+    libc.src.stdbit.stdc_first_leading_one_ull
+    libc.src.stdbit.stdc_first_trailing_zero_uc
+    libc.src.stdbit.stdc_first_trailing_zero_us
+    libc.src.stdbit.stdc_first_trailing_zero_ui
+    libc.src.stdbit.stdc_first_trailing_zero_ul
+    libc.src.stdbit.stdc_first_trailing_zero_ull
+    libc.src.stdbit.stdc_first_trailing_one_uc
+    libc.src.stdbit.stdc_first_trailing_one_us
+    libc.src.stdbit.stdc_first_trailing_one_ui
+    libc.src.stdbit.stdc_first_trailing_one_ul
+    libc.src.stdbit.stdc_first_trailing_one_ull
 
     # stdlib.h entrypoints
     libc.src.stdlib.abs
diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h
index f5e50262371f26..7d11e7d5c497e0 100644
--- a/libc/src/__support/CPP/bit.h
+++ b/libc/src/__support/CPP/bit.h
@@ -248,6 +248,35 @@ template <typename T, typename = cpp::enable_if_t<cpp::is_unsigned_v<T>>>
   return value == cpp::numeric_limits<T>::max() ? 0 : countr_zero(value) + 1;
 }
 
+/// Count number of 1's aka population count or hamming weight.
+///
+/// Only unsigned integral types are allowed.
+template <typename T, typename = cpp::enable_if_t<cpp::is_unsigned_v<T>>>
+[[nodiscard]] LIBC_INLINE constexpr int count_ones(T value) {
+  int count = 0;
+  for (int i = 0; i != cpp::numeric_limits<T>::digits; ++i)
+    if ((value >> i) & 0x1)
+      ++count;
+  return count;
+}
+#define ADD_SPECIALIZATION(TYPE, BUILTIN)                                      \
+  template <>                                                                  \
+  [[nodiscard]] LIBC_INLINE constexpr int count_ones<TYPE>(TYPE value) {       \
+    return BUILTIN(value);                                                     \
+  }
+ADD_SPECIALIZATION(unsigned char, __builtin_popcount)
+ADD_SPECIALIZATION(unsigned short, __builtin_popcount)
+ADD_SPECIALIZATION(unsigned, __builtin_popcount)
+ADD_SPECIALIZATION(unsigned long, __builtin_popcountl)
+ADD_SPECIALIZATION(unsigned long long, __builtin_popcountll)
+// TODO: 128b specializations?
+#undef ADD_SPECIALIZATION
+
+template <typename T, typename = cpp::enable_if_t<cpp::is_unsigned_v<T>>>
+[[nodiscard]] LIBC_INLINE constexpr int count_zeros(T value) {
+  return count_ones<T>(static_cast<T>(~value));
+}
+
 } // namespace LIBC_NAMESPACE::cpp
 
 #endif // LLVM_LIBC_SRC___SUPPORT_CPP_BIT_H
diff --git a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
index ea1fd68a5fcdfc..fd915373020ec9 100644
--- a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
+++ b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
@@ -161,8 +161,8 @@ LIBC_INLINE int set_except(int excepts) {
 LIBC_INLINE int raise_except(int excepts) {
   float zero = 0.0f;
   float one = 1.0f;
-  float large_value = FPBits<float>::max_normal();
-  float small_value = FPBits<float>::min_normal();
+  float large_value = FPBits<float>::max_normal().get_val();
+  float small_value = FPBits<float>::min_normal().get_val();
   auto divfunc = [](float a, float b) {
     __asm__ __volatile__("ldr  s0, %0\n\t"
                          "ldr  s1, %1\n\t"
@@ -277,8 +277,8 @@ LIBC_INLINE int set_env(const fenv_t *envp) {
     return 0;
   }
   const FEnv::FPState *state = reinterpret_cast<const FEnv::FPState *>(envp);
-  FEnv::set_control_word(state->ControlWord);
-  FEnv::set_status_word(state->StatusWord);
+  FEnv::set_control_word(static_cast<uint32_t>(state->ControlWord));
+  FEnv::set_status_word(static_cast<uint32_t>(state->StatusWord));
   return 0;
 }
 
diff --git a/libc/test/src/__support/CPP/bit_test.cpp b/libc/test/src/__support/CPP/bit_test.cpp
index 5d1f451776a5fe..115a5d505c4b7a 100644
--- a/libc/test/src/__support/CPP/bit_test.cpp
+++ b/libc/test/src/__support/CPP/bit_test.cpp
@@ -232,4 +232,17 @@ TYPED_TEST(LlvmLibcBitTest, FirstTrailingOne, UnsignedTypes) {
     EXPECT_EQ(first_trailing_one<T>(T(1) << i), i + 1);
 }
 
+TYPED_TEST(LlvmLibcBitTest, CountZeros, UnsignedTypes) {
+  EXPECT_EQ(count_zeros(T(0)), cpp::numeric_limits<T>::digits);
+  for (int i = 0; i != cpp::numeric_limits<T>::digits; ++i)
+    EXPECT_EQ(count_zeros<T>(cpp::numeric_limits<T>::max() >> i), i);
+}
+
+TYPED_TEST(LlvmLibcBitTest, CountOnes, UnsignedTypes) {
+  EXPECT_EQ(count_ones(T(0)), 0);
+  for (int i = 0; i != cpp::numeric_limits<T>::digits; ++i)
+    EXPECT_EQ(count_ones<T>(cpp::numeric_limits<T>::max() >> i),
+              cpp::numeric_limits<T>::digits - i);
+}
+
 } // namespace LIBC_NAMESPACE::cpp
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index fcca8c42b29b71..691ebfc074320f 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -310,6 +310,7 @@ struct Config {
   bool zInitfirst;
   bool zInterpose;
   bool zKeepTextSectionPrefix;
+  bool zLrodataAfterBss;
   bool zNodefaultlib;
   bool zNodelete;
   bool zNodlopen;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 01ca2e0ab0ff96..d94b8939de15bd 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1436,6 +1436,8 @@ static void readConfigs(opt::InputArgList &args) {
   config->zInterpose = hasZOption(args, "interpose");
   config->zKeepTextSectionPrefix = getZFlag(
       args, "keep-text-section-prefix", "nokeep-text-section-prefix", false);
+  config->zLrodataAfterBss =
+      getZFlag(args, "lrodata-after-bss", "nolrodata-after-bss", false);
   config->zNodefaultlib = hasZOption(args, "nodefaultlib");
   config->zNodelete = hasZOption(args, "nodelete");
   config->zNodlopen = hasZOption(args, "nodlopen");
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index bada394aa30d7d..b6bdc350bc0dd1 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -537,9 +537,11 @@ SmallVector<EhFrameSection::FdeData, 0> EhFrameSection::getFdeData() const {
     for (EhSectionPiece *fde : rec->fdes) {
       uint64_t pc = getFdePc(buf, fde->outputOff, enc);
       uint64_t fdeVA = getParent()->addr + fde->outputOff;
-      if (!isInt<32>(pc - va))
-        fatal(toString(fde->sec) + ": PC offset is too large: 0x" +
-              Twine::utohexstr(pc - va));
+      if (!isInt<32>(pc - va)) {
+        errorOrWarn(toString(fde->sec) + ": PC offset is too large: 0x" +
+                    Twine::utohexstr(pc - va));
+        continue;
+      }
       ret.push_back({uint32_t(pc - va), uint32_t(fdeVA - va)});
     }
   }
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 5b7dfd358e764e..0bbf43ddf694aa 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -911,11 +911,12 @@ enum RankFlags {
   RF_NOT_ALLOC = 1 << 26,
   RF_PARTITION = 1 << 18, // Partition number (8 bits)
   RF_NOT_SPECIAL = 1 << 17,
-  RF_WRITE = 1 << 16,
-  RF_EXEC_WRITE = 1 << 15,
-  RF_EXEC = 1 << 14,
-  RF_RODATA = 1 << 13,
-  RF_LARGE = 1 << 12,
+  RF_LARGE_ALT = 1 << 15,
+  RF_WRITE = 1 << 14,
+  RF_EXEC_WRITE = 1 << 13,
+  RF_EXEC = 1 << 12,
+  RF_RODATA = 1 << 11,
+  RF_LARGE = 1 << 10,
   RF_NOT_RELRO = 1 << 9,
   RF_NOT_TLS = 1 << 8,
   RF_BSS = 1 << 7,
@@ -974,8 +975,14 @@ static unsigned getSectionRank(OutputSection &osec) {
     if (osec.type == SHT_PROGBITS)
       rank |= RF_RODATA;
     // Among PROGBITS sections, place .lrodata further from .text.
-    if (!(osec.flags & SHF_X86_64_LARGE && config->emachine == EM_X86_64))
-      rank |= RF_LARGE;
+    // For -z lrodata-after-bss, place .lrodata after .lbss like GNU ld. This
+    // layout has one extra PT_LOAD, but alleviates relocation overflow
+    // pressure for absolute relocations referencing small data from -fno-pic
+    // relocatable files.
+    if (osec.flags & SHF_X86_64_LARGE && config->emachine == EM_X86_64)
+      rank |= config->zLrodataAfterBss ? RF_LARGE_ALT : 0;
+    else
+      rank |= config->zLrodataAfterBss ? 0 : RF_LARGE;
   } else if (isExec) {
     rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC;
   } else {
@@ -988,10 +995,15 @@ static unsigned getSectionRank(OutputSection &osec) {
       osec.relro = true;
     else
       rank |= RF_NOT_RELRO;
-    // Place .ldata and .lbss after .bss. Making .bss closer to .text alleviates
-    // relocation overflow pressure.
-    if (osec.flags & SHF_X86_64_LARGE && config->emachine == EM_X86_64)
-      rank |= RF_LARGE;
+    // Place .ldata and .lbss after .bss. Making .bss closer to .text
+    // alleviates relocation overflow pressure.
+    // For -z lrodata-after-bss, place .lbss/.lrodata/.ldata after .bss.
+    // .bss/.lbss being adjacent reuses the NOBITS size optimization.
+    if (osec.flags & SHF_X86_64_LARGE && config->emachine == EM_X86_64) {
+      rank |= config->zLrodataAfterBss
+                  ? (osec.type == SHT_NOBITS ? 1 : RF_LARGE_ALT)
+                  : RF_LARGE;
+    }
   }
 
   // Within TLS sections, or within other RelRo sections, or within non-RelRo
@@ -1103,7 +1115,7 @@ template <class ELFT> void Writer<ELFT>::setReservedSymbolSections() {
   }
 
   PhdrEntry *last = nullptr;
-  PhdrEntry *lastRO = nullptr;
+  OutputSection *lastRO = nullptr;
   auto isLarge = [](OutputSection *osec) {
     return config->emachine == EM_X86_64 && osec->flags & SHF_X86_64_LARGE;
   };
@@ -1112,17 +1124,18 @@ template <class ELFT> void Writer<ELFT>::setReservedSymbolSections() {
       if (p->p_type != PT_LOAD)
         continue;
       last = p;
-      if (!(p->p_flags & PF_W))
-        lastRO = p;
+      if (!(p->p_flags & PF_W) && p->lastSec && !isLarge(p->lastSec))
+        lastRO = p->lastSec;
     }
   }
 
   if (lastRO) {
-    // _etext is the first location after the last read-only loadable segment.
+    // _etext is the first location after the last read-only loadable segment
+    // that does not contain large sections.
     if (ElfSym::etext1)
-      ElfSym::etext1->section = lastRO->lastSec;
+      ElfSym::etext1->section = lastRO;
     if (ElfSym::etext2)
-      ElfSym::etext2->section = lastRO->lastSec;
+      ElfSym::etext2->section = lastRO;
   }
 
   if (last) {
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index efe6c256ca9d01..8dc402b7d7f49f 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -791,6 +791,9 @@ flag to indicate to the runtime linker that the object is an interposer.
 During symbol resolution interposers are searched after the application
 but before other dependencies.
 .Pp
+.It Cm lrodata-after-bss
+Place .lrodata after .bss.
+.Pp
 .It Cm muldefs
 Do not error if a symbol is defined multiple times.
 The first definition will be used.
diff --git a/lld/test/ELF/eh-frame-pcrel-overflow.s b/lld/test/ELF/eh-frame-pcrel-overflow.s
index 78e804768dad63..3dfcf9ee1a7f9a 100644
--- a/lld/test/ELF/eh-frame-pcrel-overflow.s
+++ b/lld/test/ELF/eh-frame-pcrel-overflow.s
@@ -4,7 +4,9 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %p/Inputs/eh-frame-pcrel-overflow.s -o %t1.o
 # RUN: ld.lld --eh-frame-hdr -Ttext=0x90000000 %t.o -o /dev/null
 # RUN: not ld.lld --eh-frame-hdr %t.o %t1.o -o /dev/null 2>&1 | FileCheck %s
+# RUN: ld.lld --eh-frame-hdr %t.o %t1.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s --check-prefix=WARN
 # CHECK: error: {{.*}}.o:(.eh_frame): PC offset is too large: 0x90001054
+# WARN: warning: {{.*}}.o:(.eh_frame): PC offset is too large: 0x90001054
 
 .text
 .global _start
diff --git a/lld/test/ELF/lto/codemodel.ll b/lld/test/ELF/lto/codemodel.ll
index a35f87729411d7..cf7d0e409ec4b1 100644
--- a/lld/test/ELF/lto/codemodel.ll
+++ b/lld/test/ELF/lto/codemodel.ll
@@ -2,8 +2,8 @@
 ; RUN: llvm-as %s -o %t.o
 ; RUN: ld.lld %t.o -o %ts -mllvm -code-model=small
 ; RUN: ld.lld %t.o -o %tl -mllvm -code-model=large
-; RUN: llvm-objdump --no-print-imm-hex -d %ts | FileCheck %s --check-prefix=CHECK-SMALL
-; RUN: llvm-objdump --no-print-imm-hex -d %tl | FileCheck %s --check-prefix=CHECK-LARGE
+; RUN: llvm-objdump -d %ts | FileCheck %s --check-prefix=CHECK-SMALL
+; RUN: llvm-objdump -d %tl | FileCheck %s --check-prefix=CHECK-LARGE
 
 target triple = "x86_64-unknown-linux-gnu"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -13,8 +13,8 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 define ptr @_start() nounwind readonly {
 entry:
 ; CHECK-SMALL-LABEL:  <_start>:
-; CHECK-SMALL: movl    $2097440, %eax
+; CHECK-SMALL: movl    ${{.*}}, %eax
 ; CHECK-LARGE-LABEL: <_start>:
-; CHECK-LARGE: movabsq $2097440, %rax
+; CHECK-LARGE: movabsq ${{.*}}, %rax
     ret ptr @data
 }
diff --git a/lld/test/ELF/x86-64-section-layout.s b/lld/test/ELF/x86-64-section-layout.s
index 0ba60539389397..b03d3e6c2b9990 100644
--- a/lld/test/ELF/x86-64-section-layout.s
+++ b/lld/test/ELF/x86-64-section-layout.s
@@ -12,9 +12,12 @@
 # RUN: ld.lld --section-start=.note=0x200300 a1.o -o a1
 # RUN: llvm-readelf -S -sX a1 | FileCheck %s --check-prefix=CHECK1
 
-# RUN: ld.lld -T b.lds -z norelro a.o -o b
+# RUN: ld.lld -T b.lds -z norelro a.o -z lrodata-after-bss -z nolrodata-after-bss -o b --fatal-warnings
 # RUN: llvm-readelf -S -l b | FileCheck %s --check-prefix=CHECK2
 
+# RUN: ld.lld --section-start=.note=0x200300 a.o -z lrodata-after-bss -o a3
+# RUN: llvm-readelf -S -l -sX a3 | FileCheck %s --check-prefix=CHECK3
+
 # CHECK:       Name              Type            Address          Off    Size   ES Flg Lk Inf Al
 # CHECK-NEXT:                    NULL            0000000000000000 000000 000000 00      0   0  0
 # CHECK-NEXT:  .note             NOTE            0000000000200300 000300 000001 00   A  0   0  1
@@ -80,6 +83,39 @@
 # CHECK2-NEXT:   LOAD  0x000305 0x0000000000200305 0x0000000000200305 0x001805 0x002a06 RW  0x1000
 # CHECK2-NEXT:   TLS   0x000305 0x0000000000200305 0x0000000000200305 0x000001 0x000003 R   0x1
 
+# CHECK3:       Name              Type            Address          Off    Size   ES Flg Lk Inf Al
+# CHECK3-NEXT:                    NULL            0000000000000000 000000 000000 00      0   0  0
+# CHECK3-NEXT:  .note             NOTE            0000000000200300 000300 000001 00   A  0   0  1
+# CHECK3-NEXT:  .rodata           PROGBITS        0000000000200301 000301 000001 00   A  0   0  1
+# CHECK3-NEXT:  .text             PROGBITS        0000000000201304 000304 000001 00  AX  0   0  4
+# CHECK3-NEXT:  .tdata            PROGBITS        0000000000202305 000305 000001 00 WAT  0   0  1
+# CHECK3-NEXT:  .tbss             NOBITS          0000000000202306 000306 000002 00 WAT  0   0  1
+# CHECK3-NEXT:  .relro_padding    NOBITS          0000000000202306 000306 000cfa 00  WA  0   0  1
+# CHECK3-NEXT:  .data             PROGBITS        0000000000203306 000306 000001 00  WA  0   0  1
+# CHECK3-NEXT:  .bss              NOBITS          0000000000203307 000307 001800 00  WA  0   0  1
+## We spend (size(.bss) + size(.lbss)) % MAXPAGESIZE bytes.
+# CHECK3-NEXT:  .lbss             NOBITS          0000000000204b07 000307 001201 00 WAl  0   0  1
+# CHECK3-NEXT:  .lrodata          PROGBITS        0000000000206d08 000d08 000002 00  Al  0   0  1
+# CHECK3-NEXT:  .ldata            PROGBITS        0000000000207d0a 000d0a 000002 00 WAl  0   0  1
+# CHECK3-NEXT:  .ldata2           PROGBITS        0000000000207d0c 000d0c 000001 00 WAl  0   0  1
+# CHECK3-NEXT:  .comment          PROGBITS        0000000000000000 000d0d {{.*}} 01  MS  0   0  1
+
+# CHECK3:       Program Headers:
+# CHECK3-NEXT:    Type  Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
+# CHECK3-NEXT:    PHDR  0x000040 0x0000000000200040 0x0000000000200040 {{.*}}   {{.*}}   R   0x8
+# CHECK3-NEXT:    LOAD  0x000000 0x0000000000200000 0x0000000000200000 0x000302 0x000302 R   0x1000
+# CHECK3-NEXT:    LOAD  0x000304 0x0000000000201304 0x0000000000201304 0x000001 0x000001 R E 0x1000
+# CHECK3-NEXT:    LOAD  0x000305 0x0000000000202305 0x0000000000202305 0x000001 0x000cfb RW  0x1000
+# CHECK3-NEXT:    LOAD  0x000306 0x0000000000203306 0x0000000000203306 0x000001 0x002a02 RW  0x1000
+# CHECK3-NEXT:    LOAD  0x000d08 0x0000000000206d08 0x0000000000206d08 0x000002 0x000002 R   0x1000
+# CHECK3-NEXT:    LOAD  0x000d0a 0x0000000000207d0a 0x0000000000207d0a 0x000003 0x000003 RW  0x1000
+# CHECK3-NEXT:    TLS   0x000305 0x0000000000202305 0x0000000000202305 0x000001 0x000003 R   0x1
+
+# CHECK3:       0000000000201304     0 NOTYPE  GLOBAL DEFAULT [[#]] (.text)   _start
+# CHECK3-NEXT:  0000000000201305     0 NOTYPE  GLOBAL DEFAULT [[#]] (.text)   _etext
+# CHECK3-NEXT:  0000000000203307     0 NOTYPE  GLOBAL DEFAULT [[#]] (.data)   _edata
+# CHECK3-NEXT:  0000000000207d0d     0 NOTYPE  GLOBAL DEFAULT [[#]] (.ldata2) _end
+
 #--- a.s
 .globl _start, _etext, _edata, _end
 _start:
diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h
index b99de56f534469..a326c6dc38a37a 100644
--- a/lldb/include/lldb/Interpreter/CommandObject.h
+++ b/lldb/include/lldb/Interpreter/CommandObject.h
@@ -242,6 +242,13 @@ class CommandObject : public std::enable_shared_from_this<CommandObject> {
   ///    The completion request that needs to be answered.
   virtual void HandleCompletion(CompletionRequest &request);
 
+  /// The default version handles argument definitions that have only one
+  /// argument type, and use one of the argument types that have an entry in
+  /// the CommonCompletions.  Override this if you have a more complex
+  /// argument setup.
+  /// FIXME: we should be able to extend this to more complex argument
+  /// definitions provided we have completers for all the argument types.
+  ///
   /// The input array contains a parsed version of the line.
   ///
   /// We've constructed the map of options and their arguments as well if that
@@ -251,7 +258,7 @@ class CommandObject : public std::enable_shared_from_this<CommandObject> {
   ///    The completion request that needs to be answered.
   virtual void
   HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) {}
+                           OptionElementVector &opt_element_vector);
 
   bool HelpTextContainsWord(llvm::StringRef search_word,
                             bool search_short_help = true,
diff --git a/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h b/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h
index d0cf54c31ca73f..9248e2ac814461 100644
--- a/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h
+++ b/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h
@@ -243,7 +243,7 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = {
     { lldb::eArgTypeLogCategory, "log-category", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a category within a log channel, e.g. all (try \"log list\" to see a list of all channels and their categories." },
     { lldb::eArgTypeLogChannel, "log-channel", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a log channel, e.g. process.gdb-remote (try \"log list\" to see a list of all channels and their categories)." },
     { lldb::eArgTypeMethod, "method", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A C++ method name." },
-    { lldb::eArgTypeName, "name", lldb::eTypeCategoryNameCompletion, {}, { nullptr, false }, "Help text goes here." },
+    { lldb::eArgTypeName, "name", lldb::eTypeCategoryNameCompletion, {}, { nullptr, false }, "The name of a type category." },
     { lldb::eArgTypeNewPathPrefix, "new-path-prefix", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Help text goes here." },
     { lldb::eArgTypeNumLines, "num-lines", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The number of lines to use." },
     { lldb::eArgTypeNumberPerLine, "number-per-line", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The number of items per line to display." },
@@ -260,9 +260,9 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = {
     { lldb::eArgTypePythonFunction, "python-function", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a Python function." },
     { lldb::eArgTypePythonScript, "python-script", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Source code written in Python." },
     { lldb::eArgTypeQueueName, "queue-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of the thread queue." },
-    { lldb::eArgTypeRegisterName, "register-name", lldb::CompletionType::eNoCompletion, {}, { RegisterNameHelpTextCallback, true }, nullptr },
+    { lldb::eArgTypeRegisterName, "register-name", lldb::CompletionType::eRegisterCompletion, {}, { RegisterNameHelpTextCallback, true }, nullptr },
     { lldb::eArgTypeRegularExpression, "regular-expression", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A POSIX-compliant extended regular expression." },
-    { lldb::eArgTypeRunArgs, "run-args", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Arguments to be passed to the target program when it starts executing." },
+    { lldb::eArgTypeRunArgs, "run-args", lldb::CompletionType::eDiskFileCompletion, {}, { nullptr, false }, "Arguments to be passed to the target program when it starts executing." },
     { lldb::eArgTypeRunMode, "run-mode", lldb::CompletionType::eNoCompletion, g_running_mode, { nullptr, false }, "Help text goes here." },
     { lldb::eArgTypeScriptedCommandSynchronicity, "script-cmd-synchronicity", lldb::CompletionType::eNoCompletion, g_script_synchro_type, { nullptr, false }, "The synchronicity to use to run scripted commands with regard to LLDB event system." },
     { lldb::eArgTypeScriptLang, "script-language", lldb::CompletionType::eNoCompletion, g_script_option_enumeration, { nullptr, false }, "The scripting language to be used for script-based commands.  Supported languages are python and lua." },
@@ -270,21 +270,21 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = {
     { lldb::eArgTypeSelector, "selector", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "An Objective-C selector name." },
     { lldb::eArgTypeSettingIndex, "setting-index", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "An index into a settings variable that is an array (try 'settings list' to see all the possible settings variables and their types)." },
     { lldb::eArgTypeSettingKey, "setting-key", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A key into a settings variables that is a dictionary (try 'settings list' to see all the possible settings variables and their types)." },
-    { lldb::eArgTypeSettingPrefix, "setting-prefix", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a settable internal debugger variable up to a dot ('.'), e.g. 'target.process.'" },
-    { lldb::eArgTypeSettingVariableName, "setting-variable-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a settable internal debugger variable.  Type 'settings list' to see a complete list of such variables." },
-    { lldb::eArgTypeShlibName, "shlib-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a shared library." },
+    { lldb::eArgTypeSettingPrefix, "setting-prefix", lldb::CompletionType::eSettingsNameCompletion, {}, { nullptr, false }, "The name of a settable internal debugger variable up to a dot ('.'), e.g. 'target.process.'" },
+    { lldb::eArgTypeSettingVariableName, "setting-variable-name", lldb::CompletionType::eSettingsNameCompletion, {}, { nullptr, false }, "The name of a settable internal debugger variable.  Type 'settings list' to see a complete list of such variables." },
+    { lldb::eArgTypeShlibName, "shlib-name", lldb::CompletionType::eDiskFileCompletion, {}, { nullptr, false }, "The name of a shared library." },
     { lldb::eArgTypeSourceFile, "source-file", lldb::eSourceFileCompletion, {}, { nullptr, false }, "The name of a source file.." },
     { lldb::eArgTypeSortOrder, "sort-order", lldb::CompletionType::eNoCompletion, g_sort_option_enumeration, { nullptr, false }, "Specify a sort order when dumping lists." },
     { lldb::eArgTypeStartAddress, "start-address", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Help text goes here." },
     { lldb::eArgTypeSummaryString, "summary-string", lldb::CompletionType::eNoCompletion, {}, { SummaryStringHelpTextCallback, true }, nullptr },
     { lldb::eArgTypeSymbol, "symbol", lldb::eSymbolCompletion, {}, { nullptr, false }, "Any symbol name (function name, variable, argument, etc.)" },
-    { lldb::eArgTypeThreadID, "thread-id", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Thread ID number." },
-    { lldb::eArgTypeThreadIndex, "thread-index", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Index into the process' list of threads." },
+    { lldb::eArgTypeThreadID, "thread-id", lldb::CompletionType::eThreadIndexCompletion, {}, { nullptr, false }, "Thread ID number." },
+    { lldb::eArgTypeThreadIndex, "thread-index", lldb::CompletionType::eThreadIndexCompletion, {}, { nullptr, false }, "Index into the process' list of threads." },
     { lldb::eArgTypeThreadName, "thread-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The thread's name." },
     { lldb::eArgTypeTypeName, "type-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A type name." },
     { lldb::eArgTypeUnsignedInteger, "unsigned-integer", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "An unsigned integer." },
     { lldb::eArgTypeUnixSignal, "unix-signal", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A valid Unix signal name or number (e.g. SIGKILL, KILL or 9)." },
-    { lldb::eArgTypeVarName, "variable-name", lldb::CompletionType::eNoCompletion, {} ,{ nullptr, false }, "The name of a variable in your program." },
+    { lldb::eArgTypeVarName, "variable-name", lldb::CompletionType::eVariablePathCompletion, {} ,{ nullptr, false }, "The name of a variable in your program." },
     { lldb::eArgTypeValue, "value", lldb::CompletionType::eNoCompletion, g_dependents_enumeration, { nullptr, false }, "A value could be anything, depending on where and how it is used." },
     { lldb::eArgTypeWidth, "width", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Help text goes here." },
     { lldb::eArgTypeNone, "none", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "No help available for this." },
@@ -302,8 +302,11 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = {
     { lldb::eArgTypeRecognizerID, "frame-recognizer-id", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The ID for a stack frame recognizer." },
     { lldb::eArgTypeConnectURL, "process-connect-url", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A URL-style specification for a remote connection." },
     { lldb::eArgTypeTargetID, "target-id", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The index ID for an lldb Target." },
-    { lldb::eArgTypeStopHookID, "stop-hook-id", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The ID you receive when you create a stop-hook." },
+    { lldb::eArgTypeStopHookID, "stop-hook-id", lldb::CompletionType::eStopHookIDCompletion, {}, { nullptr, false }, "The ID you receive when you create a stop-hook." },
     { lldb::eArgTypeCompletionType, "completion-type", lldb::CompletionType::eNoCompletion, g_completion_type, { nullptr, false }, "The completion type to use when adding custom commands. If none is specified, the command won't use auto-completion." },
+    { lldb::eArgTypeRemotePath, "remote-path", lldb::CompletionType::eRemoteDiskFileCompletion, {}, { nullptr, false }, "A path on the system managed by the current platform." },
+    { lldb::eArgTypeRemoteFilename, "remote-filename", lldb::CompletionType::eRemoteDiskFileCompletion, {}, { nullptr, false }, "A file on the system managed by the current platform." },
+    { lldb::eArgTypeModule, "module", lldb::CompletionType::eModuleCompletion, {}, { nullptr, false }, "The name of a module loaded into the current target." },
     // clang-format on
 };
 
diff --git a/lldb/include/lldb/Utility/FileSpecList.h b/lldb/include/lldb/Utility/FileSpecList.h
index 49edc667ddd5b6..6eb3bb9971f13a 100644
--- a/lldb/include/lldb/Utility/FileSpecList.h
+++ b/lldb/include/lldb/Utility/FileSpecList.h
@@ -238,6 +238,10 @@ class FileSpecList {
   const_iterator begin() const { return m_files.begin(); }
   const_iterator end() const { return m_files.end(); }
 
+  llvm::iterator_range<const_iterator> files() const {
+    return llvm::make_range(begin(), end());
+  }
+
 protected:
   collection m_files; ///< A collection of FileSpec objects.
 };
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 4640533047833b..85769071dae785 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -651,6 +651,9 @@ enum CommandArgumentType {
   eArgTypeTargetID,
   eArgTypeStopHookID,
   eArgTypeCompletionType,
+  eArgTypeRemotePath,
+  eArgTypeRemoteFilename,
+  eArgTypeModule,
   eArgTypeLastArg // Always keep this entry as the last entry in this
                   // enumeration!!
 };
diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp
index b7cd65059b2214..7c459bdaf38022 100644
--- a/lldb/source/Commands/CommandObjectCommands.cpp
+++ b/lldb/source/Commands/CommandObjectCommands.cpp
@@ -63,13 +63,6 @@ class CommandObjectCommandsSource : public CommandObjectParsed {
     return std::string("");
   }
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
-  }
-
   Options *GetOptions() override { return &m_options; }
 
 protected:
@@ -1968,13 +1961,6 @@ class CommandObjectCommandsScriptImport : public CommandObjectParsed {
 
   ~CommandObjectCommandsScriptImport() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
-  }
-
   Options *GetOptions() override { return &m_options; }
 
 protected:
diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
index 695f3d7931cd0a..fb2cc106ffd2dd 100644
--- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp
+++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
@@ -52,12 +52,6 @@ CommandObjectDWIMPrint::CommandObjectDWIMPrint(CommandInterpreter &interpreter)
 
 Options *CommandObjectDWIMPrint::GetOptions() { return &m_option_group; }
 
-void CommandObjectDWIMPrint::HandleArgumentCompletion(
-    CompletionRequest &request, OptionElementVector &opt_element_vector) {
-  lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-      GetCommandInterpreter(), lldb::eVariablePathCompletion, request, nullptr);
-}
-
 void CommandObjectDWIMPrint::DoExecute(StringRef command,
                                        CommandReturnObject &result) {
   m_option_group.NotifyOptionParsingStarting(&m_exe_ctx);
diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.h b/lldb/source/Commands/CommandObjectDWIMPrint.h
index d868f8964c2ac5..01ba9c225e3301 100644
--- a/lldb/source/Commands/CommandObjectDWIMPrint.h
+++ b/lldb/source/Commands/CommandObjectDWIMPrint.h
@@ -39,10 +39,6 @@ class CommandObjectDWIMPrint : public CommandObjectRaw {
 
   bool WantsCompletion() override { return true; }
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override;
-
 private:
   void DoExecute(llvm::StringRef command, CommandReturnObject &result) override;
 
diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp
index a4d3fb66e8b552..f092d54ffe9932 100644
--- a/lldb/source/Commands/CommandObjectFrame.cpp
+++ b/lldb/source/Commands/CommandObjectFrame.cpp
@@ -286,16 +286,6 @@ class CommandObjectFrameSelect : public CommandObjectParsed {
 
   ~CommandObjectFrameSelect() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    if (request.GetCursorIndex() != 0)
-      return;
-
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eFrameIndexCompletion, request, nullptr);
-  }
-
   Options *GetOptions() override { return &m_options; }
 
 protected:
@@ -446,15 +436,6 @@ may even involve JITing and running code in the target program.)");
 
   Options *GetOptions() override { return &m_option_group; }
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    // Arguments are the standard source file completer.
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eVariablePathCompletion, request,
-        nullptr);
-  }
-
 protected:
   llvm::StringRef GetScopeString(VariableSP var_sp) {
     if (!var_sp)
diff --git a/lldb/source/Commands/CommandObjectPlatform.cpp b/lldb/source/Commands/CommandObjectPlatform.cpp
index 790f1dbb475358..b25c391bd4faa2 100644
--- a/lldb/source/Commands/CommandObjectPlatform.cpp
+++ b/lldb/source/Commands/CommandObjectPlatform.cpp
@@ -418,7 +418,7 @@ class CommandObjectPlatformMkDir : public CommandObjectParsed {
       : CommandObjectParsed(interpreter, "platform mkdir",
                             "Make a new directory on the remote end.", nullptr,
                             0) {
-    CommandArgumentData thread_arg{eArgTypePath, eArgRepeatPlain};
+    CommandArgumentData thread_arg{eArgTypeRemotePath, eArgRepeatPlain};
     m_arguments.push_back({thread_arg});
   }
 
@@ -467,21 +467,12 @@ class CommandObjectPlatformFOpen : public CommandObjectParsed {
   CommandObjectPlatformFOpen(CommandInterpreter &interpreter)
       : CommandObjectParsed(interpreter, "platform file open",
                             "Open a file on the remote end.", nullptr, 0) {
-    CommandArgumentData path_arg{eArgTypePath, eArgRepeatPlain};
+    CommandArgumentData path_arg{eArgTypeRemotePath, eArgRepeatPlain};
     m_arguments.push_back({path_arg});
   }
 
   ~CommandObjectPlatformFOpen() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    if (request.GetCursorIndex() == 0)
-      lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-          GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request,
-          nullptr);
-  }
-
   void DoExecute(Args &args, CommandReturnObject &result) override {
     PlatformSP platform_sp(
         GetDebugger().GetPlatformList().GetSelectedPlatform());
@@ -795,7 +786,7 @@ class CommandObjectPlatformGetFile : public CommandObjectParsed {
     CommandArgumentData file_arg_remote, file_arg_host;
 
     // Define the first (and only) variant of this arg.
-    file_arg_remote.arg_type = eArgTypeFilename;
+    file_arg_remote.arg_type = eArgTypeRemoteFilename;
     file_arg_remote.arg_repetition = eArgRepeatPlain;
     // There is only one variant this argument could be; put it into the
     // argument entry.
@@ -876,7 +867,7 @@ class CommandObjectPlatformGetSize : public CommandObjectParsed {
     CommandArgumentData file_arg_remote;
 
     // Define the first (and only) variant of this arg.
-    file_arg_remote.arg_type = eArgTypeFilename;
+    file_arg_remote.arg_type = eArgTypeRemoteFilename;
     file_arg_remote.arg_repetition = eArgRepeatPlain;
     // There is only one variant this argument could be; put it into the
     // argument entry.
@@ -888,17 +879,6 @@ class CommandObjectPlatformGetSize : public CommandObjectParsed {
 
   ~CommandObjectPlatformGetSize() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    if (request.GetCursorIndex() != 0)
-      return;
-
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request,
-        nullptr);
-  }
-
   void DoExecute(Args &args, CommandReturnObject &result) override {
     // If the number of arguments is incorrect, issue an error message.
     if (args.GetArgumentCount() != 1) {
@@ -946,7 +926,7 @@ class CommandObjectPlatformGetPermissions : public CommandObjectParsed {
     CommandArgumentData file_arg_remote;
 
     // Define the first (and only) variant of this arg.
-    file_arg_remote.arg_type = eArgTypeFilename;
+    file_arg_remote.arg_type = eArgTypeRemoteFilename;
     file_arg_remote.arg_repetition = eArgRepeatPlain;
     // There is only one variant this argument could be; put it into the
     // argument entry.
@@ -958,17 +938,6 @@ class CommandObjectPlatformGetPermissions : public CommandObjectParsed {
 
   ~CommandObjectPlatformGetPermissions() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    if (request.GetCursorIndex() != 0)
-      return;
-
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request,
-        nullptr);
-  }
-
   void DoExecute(Args &args, CommandReturnObject &result) override {
     // If the number of arguments is incorrect, issue an error message.
     if (args.GetArgumentCount() != 1) {
@@ -1015,7 +984,7 @@ class CommandObjectPlatformFileExists : public CommandObjectParsed {
     CommandArgumentData file_arg_remote;
 
     // Define the first (and only) variant of this arg.
-    file_arg_remote.arg_type = eArgTypeFilename;
+    file_arg_remote.arg_type = eArgTypeRemoteFilename;
     file_arg_remote.arg_repetition = eArgRepeatPlain;
     // There is only one variant this argument could be; put it into the
     // argument entry.
@@ -1027,17 +996,6 @@ class CommandObjectPlatformFileExists : public CommandObjectParsed {
 
   ~CommandObjectPlatformFileExists() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    if (request.GetCursorIndex() != 0)
-      return;
-
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request,
-        nullptr);
-  }
-
   void DoExecute(Args &args, CommandReturnObject &result) override {
     // If the number of arguments is incorrect, issue an error message.
     if (args.GetArgumentCount() != 1) {
@@ -1080,7 +1038,7 @@ class CommandObjectPlatformPutFile : public CommandObjectParsed {
 
     Omitting the destination places the file in the platform working directory.)");
     CommandArgumentData source_arg{eArgTypePath, eArgRepeatPlain};
-    CommandArgumentData path_arg{eArgTypePath, eArgRepeatOptional};
+    CommandArgumentData path_arg{eArgTypeRemotePath, eArgRepeatOptional};
     m_arguments.push_back({source_arg});
     m_arguments.push_back({path_arg});
   }
@@ -1139,6 +1097,16 @@ class CommandObjectPlatformProcessLaunch : public CommandObjectParsed {
     m_arguments.push_back({run_arg_arg});
   }
 
+  void
+  HandleArgumentCompletion(CompletionRequest &request,
+                           OptionElementVector &opt_element_vector) override {
+    // I didn't make a type for RemoteRunArgs, but since we're going to run
+    // this on the remote system we should use the remote completer.
+    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
+        GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request,
+        nullptr);
+  }
+
   ~CommandObjectPlatformProcessLaunch() override = default;
 
   Options *GetOptions() override { return &m_all_options; }
@@ -1552,13 +1520,6 @@ class CommandObjectPlatformProcessInfo : public CommandObjectParsed {
 
   ~CommandObjectPlatformProcessInfo() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eProcessIDCompletion, request, nullptr);
-  }
-
 protected:
   void DoExecute(Args &args, CommandReturnObject &result) override {
     Target *target = GetDebugger().GetSelectedTarget().get();
@@ -1850,7 +1811,7 @@ class CommandObjectPlatformInstall : public CommandObjectParsed {
             "Install a target (bundle or executable file) to the remote end.",
             "platform target-install <local-thing> <remote-sandbox>", 0) {
     CommandArgumentData local_arg{eArgTypePath, eArgRepeatPlain};
-    CommandArgumentData remote_arg{eArgTypePath, eArgRepeatPlain};
+    CommandArgumentData remote_arg{eArgTypeRemotePath, eArgRepeatPlain};
     m_arguments.push_back({local_arg});
     m_arguments.push_back({remote_arg});
   }
diff --git a/lldb/source/Commands/CommandObjectPlugin.cpp b/lldb/source/Commands/CommandObjectPlugin.cpp
index f22885144b09b3..da3b5f0518a690 100644
--- a/lldb/source/Commands/CommandObjectPlugin.cpp
+++ b/lldb/source/Commands/CommandObjectPlugin.cpp
@@ -36,13 +36,6 @@ class CommandObjectPluginLoad : public CommandObjectParsed {
 
   ~CommandObjectPluginLoad() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
-  }
-
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     size_t argc = command.GetArgumentCount();
diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp
index 722b0e0c376be8..7cd5ad656f1b05 100644
--- a/lldb/source/Commands/CommandObjectProcess.cpp
+++ b/lldb/source/Commands/CommandObjectProcess.cpp
@@ -143,14 +143,6 @@ class CommandObjectProcessLaunch : public CommandObjectProcessLaunchOrAttach {
 
   ~CommandObjectProcessLaunch() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
-  }
-
   Options *GetOptions() override { return &m_all_options; }
 
   std::optional<std::string> GetRepeatCommand(Args &current_command_args,
@@ -1015,9 +1007,7 @@ class CommandObjectProcessLoad : public CommandObjectParsed {
                            OptionElementVector &opt_element_vector) override {
     if (!m_exe_ctx.HasProcessScope())
       return;
-
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
+    CommandObject::HandleArgumentCompletion(request, opt_element_vector);
   }
 
   Options *GetOptions() override { return &m_options; }
@@ -1292,13 +1282,6 @@ class CommandObjectProcessSaveCore : public CommandObjectParsed {
 
   Options *GetOptions() override { return &m_options; }
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
-  }
-
   class CommandOptions : public Options {
   public:
     CommandOptions() = default;
diff --git a/lldb/source/Commands/CommandObjectRegister.cpp b/lldb/source/Commands/CommandObjectRegister.cpp
index a4d53e5c8dd5f1..4ffdde1ee09f9f 100644
--- a/lldb/source/Commands/CommandObjectRegister.cpp
+++ b/lldb/source/Commands/CommandObjectRegister.cpp
@@ -80,9 +80,7 @@ class CommandObjectRegisterRead : public CommandObjectParsed {
                            OptionElementVector &opt_element_vector) override {
     if (!m_exe_ctx.HasProcessScope())
       return;
-
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eRegisterCompletion, request, nullptr);
+    CommandObject::HandleArgumentCompletion(request, opt_element_vector);
   }
 
   Options *GetOptions() override { return &m_option_group; }
@@ -440,8 +438,7 @@ different for the same register when connected to different debug servers.)");
                            OptionElementVector &opt_element_vector) override {
     if (!m_exe_ctx.HasProcessScope() || request.GetCursorIndex() != 0)
       return;
-    CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eRegisterCompletion, request, nullptr);
+    CommandObject::HandleArgumentCompletion(request, opt_element_vector);
   }
 
 protected:
diff --git a/lldb/source/Commands/CommandObjectSession.cpp b/lldb/source/Commands/CommandObjectSession.cpp
index d140bdfdba57b3..28506d6c59512d 100644
--- a/lldb/source/Commands/CommandObjectSession.cpp
+++ b/lldb/source/Commands/CommandObjectSession.cpp
@@ -28,13 +28,6 @@ class CommandObjectSessionSave : public CommandObjectParsed {
 
   ~CommandObjectSessionSave() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
-  }
-
 protected:
   void DoExecute(Args &args, CommandReturnObject &result) override {
     llvm::StringRef file_path;
diff --git a/lldb/source/Commands/CommandObjectSettings.cpp b/lldb/source/Commands/CommandObjectSettings.cpp
index 5fb7dcc80279fd..0cf3d1daf7f528 100644
--- a/lldb/source/Commands/CommandObjectSettings.cpp
+++ b/lldb/source/Commands/CommandObjectSettings.cpp
@@ -262,14 +262,6 @@ class CommandObjectSettingsShow : public CommandObjectParsed {
 
   ~CommandObjectSettingsShow() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eSettingsNameCompletion, request,
-        nullptr);
-  }
-
 protected:
   void DoExecute(Args &args, CommandReturnObject &result) override {
     result.SetStatus(eReturnStatusSuccessFinishResult);
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index c3ecdb7700c256..4e006e4bb0e0fc 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -257,13 +257,6 @@ class CommandObjectTargetCreate : public CommandObjectParsed {
 
   Options *GetOptions() override { return &m_option_group; }
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
-  }
-
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     const size_t argc = command.GetArgumentCount();
@@ -2789,13 +2782,6 @@ class CommandObjectTargetModulesAdd : public CommandObjectParsed {
 
   Options *GetOptions() override { return &m_option_group; }
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
-  }
-
 protected:
   OptionGroupOptions m_option_group;
   OptionGroupUUID m_uuid_option_group;
@@ -3233,7 +3219,7 @@ class CommandObjectTargetModulesList : public CommandObjectParsed {
       : CommandObjectParsed(
             interpreter, "target modules list",
             "List current executable and dependent shared library images.") {
-    CommandArgumentData module_arg{eArgTypeShlibName, eArgRepeatStar};
+    CommandArgumentData module_arg{eArgTypeModule, eArgRepeatStar};
     m_arguments.push_back({module_arg});
   }
 
@@ -4343,13 +4329,6 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed {
 
   ~CommandObjectTargetSymbolsAdd() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr);
-  }
-
   Options *GetOptions() override { return &m_option_group; }
 
 protected:
@@ -5195,8 +5174,7 @@ class CommandObjectTargetStopHookDelete : public CommandObjectParsed {
                            OptionElementVector &opt_element_vector) override {
     if (request.GetCursorIndex())
       return;
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eStopHookIDCompletion, request, nullptr);
+    CommandObject::HandleArgumentCompletion(request, opt_element_vector);
   }
 
 protected:
@@ -5251,8 +5229,7 @@ class CommandObjectTargetStopHookEnableDisable : public CommandObjectParsed {
                            OptionElementVector &opt_element_vector) override {
     if (request.GetCursorIndex())
       return;
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eStopHookIDCompletion, request, nullptr);
+    CommandObject::HandleArgumentCompletion(request, opt_element_vector);
   }
 
 protected:
diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp
index a1e7e3f11361e7..52e493b13c61ca 100644
--- a/lldb/source/Commands/CommandObjectThread.cpp
+++ b/lldb/source/Commands/CommandObjectThread.cpp
@@ -403,10 +403,7 @@ class CommandObjectThreadStepWithTypeAndScope : public CommandObjectParsed {
                            OptionElementVector &opt_element_vector) override {
     if (request.GetCursorIndex())
       return;
-
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eThreadIndexCompletion, request,
-        nullptr);
+    CommandObject::HandleArgumentCompletion(request, opt_element_vector);
   }
 
   Options *GetOptions() override { return &m_all_options; }
@@ -663,14 +660,6 @@ class CommandObjectThreadContinue : public CommandObjectParsed {
 
   ~CommandObjectThreadContinue() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eThreadIndexCompletion, request,
-        nullptr);
-  }
-
   void DoExecute(Args &command, CommandReturnObject &result) override {
     bool synchronous_execution = m_interpreter.GetSynchronous();
 
diff --git a/lldb/source/Commands/CommandObjectType.cpp b/lldb/source/Commands/CommandObjectType.cpp
index f76420f3cc6837..036b8e9d9def13 100644
--- a/lldb/source/Commands/CommandObjectType.cpp
+++ b/lldb/source/Commands/CommandObjectType.cpp
@@ -1758,14 +1758,6 @@ class CommandObjectTypeCategoryDefine : public CommandObjectParsed {
 
   ~CommandObjectTypeCategoryDefine() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eTypeCategoryNameCompletion, request,
-        nullptr);
-  }
-
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     const size_t argc = command.GetArgumentCount();
@@ -1859,14 +1851,6 @@ class CommandObjectTypeCategoryEnable : public CommandObjectParsed {
 
   ~CommandObjectTypeCategoryEnable() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eTypeCategoryNameCompletion, request,
-        nullptr);
-  }
-
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     const size_t argc = command.GetArgumentCount();
@@ -1926,14 +1910,6 @@ class CommandObjectTypeCategoryDelete : public CommandObjectParsed {
 
   ~CommandObjectTypeCategoryDelete() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eTypeCategoryNameCompletion, request,
-        nullptr);
-  }
-
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     const size_t argc = command.GetArgumentCount();
@@ -2033,14 +2009,6 @@ class CommandObjectTypeCategoryDisable : public CommandObjectParsed {
 
   ~CommandObjectTypeCategoryDisable() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eTypeCategoryNameCompletion, request,
-        nullptr);
-  }
-
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     const size_t argc = command.GetArgumentCount();
diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp
index 438a16c50bd67f..5b74b1ae43accc 100644
--- a/lldb/source/Commands/CommandObjectWatchpoint.cpp
+++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp
@@ -831,16 +831,6 @@ corresponding to the byte size of the data type.");
 
   ~CommandObjectWatchpointSetVariable() override = default;
 
-  void
-  HandleArgumentCompletion(CompletionRequest &request,
-                           OptionElementVector &opt_element_vector) override {
-    if (request.GetCursorIndex() != 0)
-      return;
-    lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
-        GetCommandInterpreter(), lldb::eVariablePathCompletion, request,
-        nullptr);
-  }
-
   Options *GetOptions() override { return &m_option_group; }
 
 protected:
diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp
index 6ed0fd1f1ddbd9..93c53e89f7d1ae 100644
--- a/lldb/source/Interpreter/CommandObject.cpp
+++ b/lldb/source/Interpreter/CommandObject.cpp
@@ -305,6 +305,43 @@ void CommandObject::HandleCompletion(CompletionRequest &request) {
   }
 }
 
+void CommandObject::HandleArgumentCompletion(
+    CompletionRequest &request, OptionElementVector &opt_element_vector) {
+  size_t num_arg_entries = GetNumArgumentEntries();
+  if (num_arg_entries != 1)
+    return;
+
+  CommandArgumentEntry *entry_ptr = GetArgumentEntryAtIndex(0);
+  if (!entry_ptr) {
+    assert(entry_ptr && "We said there was one entry, but there wasn't.");
+    return; // Not worth crashing if asserts are off...
+  }
+  
+  CommandArgumentEntry &entry = *entry_ptr;
+  // For now, we only handle the simple case of one homogenous argument type.
+  if (entry.size() != 1)
+    return;
+
+  // Look up the completion type, and if it has one, invoke it:
+  const CommandObject::ArgumentTableEntry *arg_entry =
+      FindArgumentDataByType(entry[0].arg_type);
+  const ArgumentRepetitionType repeat = entry[0].arg_repetition;
+
+  if (arg_entry == nullptr || arg_entry->completion_type == lldb::eNoCompletion)
+    return;
+
+  // FIXME: This should be handled higher in the Command Parser.
+  // Check the case where this command only takes one argument, and don't do
+  // the completion if we aren't on the first entry:
+  if (repeat == eArgRepeatPlain && request.GetCursorIndex() != 0)
+    return;
+
+  lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks(
+      GetCommandInterpreter(), arg_entry->completion_type, request, nullptr);
+
+}
+
+
 bool CommandObject::HelpTextContainsWord(llvm::StringRef search_word,
                                          bool search_short_help,
                                          bool search_long_help,
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.cpp
index 6b063f3bd88d85..795355b57a06db 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.cpp
@@ -22,6 +22,7 @@ static constexpr Log::Category g_categories[] = {
     {{"map"},
      {"log insertions of object files into DWARF debug maps"},
      DWARFLog::DebugMap},
+    {{"split"}, {"log split DWARF related activities"}, DWARFLog::SplitDwarf},
 };
 
 static Log::Channel g_channel(g_categories, DWARFLog::DebugInfo);
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.h
index 662aa6757e2ffc..7f254a1162bd10 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.h
@@ -20,6 +20,7 @@ enum class DWARFLog : Log::MaskType {
   DebugMap = Log::ChannelFlag<2>,
   Lookups = Log::ChannelFlag<3>,
   TypeCompletion = Log::ChannelFlag<4>,
+  SplitDwarf = Log::ChannelFlag<5>,
   LLVM_MARK_AS_BITMASK_ENUM(TypeCompletion)
 };
 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index 42211b9a21b0e3..84ff4c2565a050 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -4349,26 +4349,60 @@ SymbolFileDWARFDebugMap *SymbolFileDWARF::GetDebugMapSymfile() {
 
 const std::shared_ptr<SymbolFileDWARFDwo> &SymbolFileDWARF::GetDwpSymbolFile() {
   llvm::call_once(m_dwp_symfile_once_flag, [this]() {
+    // Create a list of files to try and append .dwp to.
+    FileSpecList symfiles;
+    // Append the module's object file path.
+    const FileSpec module_fspec = m_objfile_sp->GetModule()->GetFileSpec();
+    symfiles.Append(module_fspec);
+    // Append the object file for this SymbolFile only if it is different from
+    // the module's file path. Our main module could be "a.out", our symbol file
+    // could be "a.debug" and our ".dwp" file might be "a.debug.dwp" instead of
+    // "a.out.dwp".
+    const FileSpec symfile_fspec(m_objfile_sp->GetFileSpec());
+    if (symfile_fspec != module_fspec) {
+      symfiles.Append(symfile_fspec);
+    } else {
+      // If we don't have a separate debug info file, then try stripping the
+      // extension. The main module could be "a.debug" and the .dwp file could
+      // be "a.dwp" instead of "a.debug.dwp".
+      ConstString filename_no_ext =
+          module_fspec.GetFileNameStrippingExtension();
+      if (filename_no_ext != module_fspec.GetFilename()) {
+        FileSpec module_spec_no_ext(module_fspec);
+        module_spec_no_ext.SetFilename(filename_no_ext);
+        symfiles.Append(module_spec_no_ext);
+      }
+    }
+    Log *log = GetLog(DWARFLog::SplitDwarf);
+    FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths();
     ModuleSpec module_spec;
     module_spec.GetFileSpec() = m_objfile_sp->GetFileSpec();
-    module_spec.GetSymbolFileSpec() =
-        FileSpec(m_objfile_sp->GetModule()->GetFileSpec().GetPath() + ".dwp");
-
     module_spec.GetUUID() = m_objfile_sp->GetUUID();
-    FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths();
-    FileSpec dwp_filespec =
-        PluginManager::LocateExecutableSymbolFile(module_spec, search_paths);
-    if (FileSystem::Instance().Exists(dwp_filespec)) {
-      DataBufferSP dwp_file_data_sp;
-      lldb::offset_t dwp_file_data_offset = 0;
-      ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin(
-          GetObjectFile()->GetModule(), &dwp_filespec, 0,
-          FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp,
-          dwp_file_data_offset);
-      if (!dwp_obj_file)
-        return;
-      m_dwp_symfile = std::make_shared<SymbolFileDWARFDwo>(
-          *this, dwp_obj_file, DIERef::k_file_index_mask);
+    for (const auto &symfile : symfiles.files()) {
+      module_spec.GetSymbolFileSpec() =
+          FileSpec(symfile.GetPath() + ".dwp", symfile.GetPathStyle());
+      LLDB_LOG(log, "Searching for DWP using: \"{0}\"",
+               module_spec.GetSymbolFileSpec());
+      FileSpec dwp_filespec =
+          PluginManager::LocateExecutableSymbolFile(module_spec, search_paths);
+      if (FileSystem::Instance().Exists(dwp_filespec)) {
+        LLDB_LOG(log, "Found DWP file: \"{0}\"", dwp_filespec);
+        DataBufferSP dwp_file_data_sp;
+        lldb::offset_t dwp_file_data_offset = 0;
+        ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin(
+            GetObjectFile()->GetModule(), &dwp_filespec, 0,
+            FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp,
+            dwp_file_data_offset);
+        if (dwp_obj_file) {
+          m_dwp_symfile = std::make_shared<SymbolFileDWARFDwo>(
+              *this, dwp_obj_file, DIERef::k_file_index_mask);
+          break;
+        }
+      }
+    }
+    if (!m_dwp_symfile) {
+      LLDB_LOG(log, "Unable to locate for DWP file for: \"{0}\"",
+               m_objfile_sp->GetModule()->GetFileSpec());
     }
   });
   return m_dwp_symfile;
diff --git a/lldb/test/API/commands/help/TestHelp.py b/lldb/test/API/commands/help/TestHelp.py
index 95ffdb3cc8b18b..f0f5bcb3218011 100644
--- a/lldb/test/API/commands/help/TestHelp.py
+++ b/lldb/test/API/commands/help/TestHelp.py
@@ -104,7 +104,7 @@ def test_help_image_du_line_should_work(self):
     def test_help_image_list_shows_positional_args(self):
         """Command 'help image list' should describe positional args."""
         # 'image' is an alias for 'target modules'.
-        self.expect("help image list", substrs=["<shlib-name> [...]"])
+        self.expect("help image list", substrs=["<module> [...]"])
 
     @no_debug_info_test
     def test_help_target_variable_syntax(self):
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp
index a47209931c3840..9a8149065b6e58 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp
@@ -1,12 +1,16 @@
 // REQUIRES: lld
 
+// Now test with DWARF5
 // RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o
 // RUN: ld.lld %t.dwarf5.o -o %t.dwarf5
 // RUN: llvm-dwp %t.dwarf5.dwo -o %t.dwarf5.dwp
 // RUN: rm %t.dwarf5.dwo
 // RUN: llvm-objcopy --only-keep-debug %t.dwarf5 %t.dwarf5.debug
 // RUN: llvm-objcopy --strip-all --add-gnu-debuglink=%t.dwarf5.debug %t.dwarf5
-// RUN: %lldb %t.dwarf5 -o "target variable a" -b | FileCheck %s
+// RUN: %lldb \
+// RUN:   -O "log enable dwarf split" \
+// RUN:   -o "target variable a" \
+// RUN:   -b %t.dwarf5 | FileCheck %s
 
 // Run one time with the index cache enabled to populate the index cache. When
 // we populate the index cache we have to parse all of the DWARF debug info
@@ -34,6 +38,31 @@
 // RUN:   -o "statistics dump" \
 // RUN:   %t.dwarf5 -b | FileCheck %s -check-prefix=CACHED
 
+// Make sure that if we load the "%t.dwarf5.debug" file, that we can find and
+// load the .dwo file from the .dwp when it is "%t.dwarf5.dwp"
+// RUN: %lldb %t.dwarf5.debug -o "b main" -b | FileCheck %s -check-prefix=DEBUG
+
+// Make sure that if we load the "%t.dwarf5" file, that we can find and
+// load the .dwo file from the .dwp when it is "%t.dwarf5.debug.dwp"
+// RUN: mv %t.dwarf5.dwp %t.dwarf5.debug.dwp
+// RUN: %lldb %t.dwarf5 -o "b main" -b | FileCheck %s -check-prefix=DEBUG
+
+// Make sure that if we load the "%t.dwarf5.debug" file, that we can find and
+// load the .dwo file from the .dwp when it is "%t.dwarf5.debug.dwp"
+// RUN: %lldb %t.dwarf5.debug -o "b main" -b | FileCheck %s -check-prefix=DEBUG
+
+// Make sure that if we remove the .dwp file we see an appropriate error.
+// RUN: rm %t.dwarf5.debug.dwp
+// RUN: %lldb \
+// RUN:   -O "log enable dwarf split" \
+// RUN:   -o "b main" \
+// RUN:   -b %t.dwarf5 2>&1 | FileCheck %s -check-prefix=NODWP
+
+// RUN: %lldb \
+// RUN:   -O "log enable dwarf split" \
+// RUN:   -o "b main" \
+// RUN:   -b %t.dwarf5.debug 2>&1 | FileCheck %s -check-prefix=NODWP
+
 // Now test with DWARF4
 // RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o
 // RUN: ld.lld %t.dwarf4.o -o %t.dwarf4
@@ -41,7 +70,10 @@
 // RUN: rm %t.dwarf4.dwo
 // RUN: llvm-objcopy --only-keep-debug %t.dwarf4 %t.dwarf4.debug
 // RUN: llvm-objcopy --strip-all --add-gnu-debuglink=%t.dwarf4.debug %t.dwarf4
-// RUN: %lldb %t.dwarf4 -o "target variable a" -b | FileCheck %s
+// RUN: %lldb \
+// RUN:   -O "log enable dwarf split" \
+// RUN:   -o "target variable a" \
+// RUN:   -b %t.dwarf4 | FileCheck %s
 
 // Run one time with the index cache enabled to populate the index cache. When
 // we populate the index cache we have to parse all of the DWARF debug info
@@ -69,6 +101,46 @@
 // RUN:   -o "statistics dump" \
 // RUN:   %t.dwarf4 -b | FileCheck %s -check-prefix=CACHED
 
+// Make sure that if we load the "%t.dwarf4.debug" file, that we can find and
+// load the .dwo file from the .dwp when it is "%t.dwarf4.dwp"
+// RUN: %lldb %t.dwarf4.debug -o "b main" -b | FileCheck %s -check-prefix=DEBUG
+
+// Make sure that if we load the "%t.dwarf4" file, that we can find and
+// load the .dwo file from the .dwp when it is "%t.dwarf4.debug.dwp"
+// RUN: mv %t.dwarf4.dwp %t.dwarf4.debug.dwp
+// RUN: %lldb %t.dwarf4 -o "b main" -b | FileCheck %s -check-prefix=DEBUG
+
+// Make sure that if we load the "%t.dwarf4.debug" file, that we can find and
+// load the .dwo file from the .dwp when it is "%t.dwarf4.debug.dwp"
+// RUN: %lldb %t.dwarf4.debug -o "b main" -b | FileCheck %s -check-prefix=DEBUG
+
+// Make sure that if we remove the .dwp file we see an appropriate error.
+// RUN: rm %t.dwarf4.debug.dwp
+// RUN: %lldb \
+// RUN:   -O "log enable dwarf split" \
+// RUN:   -o "b main" \
+// RUN:   -b %t.dwarf4 2>&1 | FileCheck %s -check-prefix=NODWP
+
+// RUN: %lldb \
+// RUN:   -O "log enable dwarf split" \
+// RUN:   -o "b main" \
+// RUN:   -b %t.dwarf4.debug 2>&1 | FileCheck %s -check-prefix=NODWP
+
+// Test if we have a GNU build ID in our main executable and in our debug file,
+// and we have a .dwp file that doesn't, that we can still load our .dwp file.
+// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o
+// RUN: ld.lld %t.o --build-id=md5 -o %t
+// RUN: llvm-dwp %t.dwo -o %t.dwp
+// RUN: rm %t.dwo
+// RUN: llvm-objcopy --only-keep-debug %t %t.debug
+// RUN: llvm-objcopy --strip-all --add-gnu-debuglink=%t.debug %t
+// RUN: %lldb \
+// RUN:   -O "log enable dwarf split" \
+// RUN:   -o "target variable a" \
+// RUN:   -b %t | FileCheck %s
+
+// CHECK: Searching for DWP using:
+// CHECK: Found DWP file:
 // CHECK: (A) a = (x = 47)
 
 // CACHE: script lldb.target.modules[0].FindTypes('::A').GetTypeAtIndex(0)
@@ -83,6 +155,16 @@
 // CACHED-NEXT: }
 // CACHED: "totalDebugInfoIndexLoadedFromCache": 1
 
+// Make sure debug information was loaded by verifying that the
+// DEBUG: Breakpoint 1: where = dwp-separate-debug-file.cpp.tmp.dwarf{{[45]}}{{(\.debug)?}}`main + {{[0-9]+}} at dwp-separate-debug-file.cpp:{{[0-9]+}}:{{[0-9]+}}, address = {{0x[0-9a-fA-F]+}}
+
+// Make sure if we load the stripped binary or the debug info file with no .dwp
+// nor any .dwo files that we are not able to fine the .dwp or .dwo files.
+// NODWP: Searching for DWP using:
+// NODWP: Searching for DWP using:
+// NODWP: Unable to locate for DWP file for:
+// NODWP: unable to locate separate debug file (dwo, dwp). Debugging will be degraded.
+
 struct A {
   int x = 47;
 };
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 075ff0eb28ac0a..02ec869725af2f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -224,12 +224,15 @@ bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) {
           Encoding == dwarf::DW_ATE_float || Encoding == dwarf::DW_ATE_UTF ||
           Encoding == dwarf::DW_ATE_boolean ||
           Encoding == dwarf::DW_ATE_complex_float ||
+          Encoding == dwarf::DW_ATE_signed_fixed ||
+          Encoding == dwarf::DW_ATE_unsigned_fixed ||
           (Ty->getTag() == dwarf::DW_TAG_unspecified_type &&
            Ty->getName() == "decltype(nullptr)")) &&
          "Unsupported encoding");
   return Encoding == dwarf::DW_ATE_unsigned ||
          Encoding == dwarf::DW_ATE_unsigned_char ||
          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean ||
+         Encoding == llvm::dwarf::DW_ATE_unsigned_fixed ||
          Ty->getTag() == dwarf::DW_TAG_unspecified_type;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 85bd33e4efbd0f..5b32b34079f44e 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -323,6 +323,9 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f32);
   }
 
+  bool isPackedFP16InputMods() const {
+    return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::v2f16);
+  }
 
   bool isVReg() const {
     return isRegClass(AMDGPU::VGPR_32RegClassID) ||
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 8bf05682cbe7ea..d16d8ebd41a54f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -219,8 +219,10 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
   default:
     return false;
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index cd14c12a8a80c6..97c723752b70b9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1289,9 +1289,8 @@ def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
 
 class PackedFPInputModsMatchClass <int opSize> : AsmOperandClass {
   let Name = "PackedFP"#opSize#"InputMods";
-  let ParserMethod = "parseRegOrImm";
-  let PredicateMethod = "isRegOrImm";
-//  let PredicateMethod = "isPackedFP"#opSize#"InputMods";
+  let ParserMethod = "parseRegOrImmWithFPInputMods";
+  let PredicateMethod = "isPackedFP"#opSize#"InputMods";
 }
 
 class PackedIntInputModsMatchClass <int opSize> : AsmOperandClass {
@@ -1305,7 +1304,7 @@ def PackedF16InputModsMatchClass : PackedFPInputModsMatchClass<16>;
 def PackedI16InputModsMatchClass : PackedIntInputModsMatchClass<16>;
 
 class PackedFPInputMods <PackedFPInputModsMatchClass matchClass> : InputMods <matchClass> {
-//  let PrintMethod = "printPackedFPInputMods";
+  let PrintMethod = "printOperandAndFPInputMods";
 }
 
 class PackedIntInputMods <PackedIntInputModsMatchClass matchClass> : InputMods <matchClass> {
@@ -1606,8 +1605,11 @@ class getSrcMod <ValueType VT, bit IsTrue16 = 0> {
 }
 
 class getOpSelMod <ValueType VT> {
-  Operand ret = !if(!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
-                    FP16InputMods, IntOpSelMods);
+  Operand ret = !cond(!eq(VT, f16) : FP16InputMods,
+                      !eq(VT, bf16) : FP16InputMods,
+                      !eq(VT, v2f16) : PackedF16InputMods,
+                      !eq(VT, v2bf16) : PackedF16InputMods,
+                      1 : IntOpSelMods);
 }
 
 // Return type of input modifiers operand specified input operand for DPP
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 0d4057b3ddd109..99f8e8ede4ace9 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1,3 +1,4 @@
+
 //===-- VOP1Instructions.td - Vector Instruction Definitions --------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -565,7 +566,7 @@ class VOPProfile_Base_CVT_F32_F8<ValueType vt> : VOPProfileI2F <vt, i32> {
 def VOPProfileCVT_F32_F8    : VOPProfile_Base_CVT_F32_F8 <f32>;
 def VOPProfileCVT_PK_F32_F8 : VOPProfile_Base_CVT_F32_F8 <v2f32>;
 
-let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0,
+let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
     SchedRW = [WriteFloatCvt] in {
   defm V_CVT_F32_FP8    : VOP1Inst<"v_cvt_f32_fp8", VOPProfileCVT_F32_F8>;
   defm V_CVT_F32_BF8    : VOP1Inst<"v_cvt_f32_bf8", VOPProfileCVT_F32_F8>;
@@ -653,7 +654,7 @@ class Cvt_F32_F8_Pat_OpSel<SDPatternOperator node, bits<2> index,
          (inst_e32 $src))
 >;
 
-let SubtargetPredicate = isGFX12Plus in {
+let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
   foreach Index = [0, 1, 2, 3] in {
     def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_fp8, Index,
                                V_CVT_F32_FP8_e32, V_CVT_F32_FP8_OP_SEL_e64>;
@@ -670,7 +671,7 @@ class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
          (inst_e32 $src))
 >;
 
-let SubtargetPredicate = isGFX12Plus in {
+let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
   foreach Index = [0, -1] in {
     def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_fp8, Index,
                                   V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_OP_SEL_e64>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 35cffa22f45929..396ae9c9d92eea 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -640,7 +640,7 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32
 let SubtargetPredicate = isGFX940Plus in
 defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>;
 
-let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0,
+let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
     SchedRW = [WriteFloatCvt] in {
   let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
     defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
@@ -667,6 +667,7 @@ class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst>
           !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0)
 >;
 
+let OtherPredicates = [HasFP8ConversionInsts] in {
 foreach Index = [0, -1] in {
   def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
   def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
@@ -676,6 +677,7 @@ foreach Index = [0, 1, 2, 3] in {
   def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
   def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
 }
+}
 
 class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
   // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
@@ -866,20 +868,9 @@ def : DivFmasPat<f32, V_DIV_FMAS_F32_e64, VCC_LO>;
 def : DivFmasPat<f64, V_DIV_FMAS_F64_e64, VCC_LO>;
 }
 
-class VOP3_DOT_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile<P, Features> {
+class VOP3_DOT_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
   let HasClamp = 0;
   let HasOMod = 0;
-  // Override modifiers for bf16(i16) (same as float modifiers).
-  let HasSrc0Mods = 1;
-  let HasSrc1Mods = 1;
-  let HasSrc2Mods = 1;
-  let Src0ModVOP3DPP = FPVRegInputMods;
-  let Src1ModVOP3DPP = FPVRegInputMods;
-  let Src2ModVOP3DPP = FP16InputMods;
-  let InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
-                                     HasClamp, HasOMod, FP16InputMods,
-                                     FP16InputMods, FP16InputMods>.ret;
-  let AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs, HasClamp, HasOMod, 1, 1, 1>.ret;
 }
 
 let SubtargetPredicate = isGFX11Plus in {
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c47c13dbb84025..801afabbdb1401 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -210,7 +210,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
 class VOP3_Real_Gen <VOP_Pseudo ps, GFXGen Gen, string asm_name = ps.Mnemonic> :
   VOP3_Real <ps, Gen.Subtarget, asm_name> {
   let AssemblerPredicate = Gen.AssemblerPredicate;
-  let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []);
+  let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
   let DecoderNamespace = Gen.DecoderNamespace#
                          !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
 }
@@ -1349,7 +1349,7 @@ class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
                      string opName = ps.OpName> :
   VOP3_DPP16 <op, ps, Gen.Subtarget, opName> {
   let AssemblerPredicate = Gen.AssemblerPredicate;
-  let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []);
+  let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
   let DecoderNamespace = "DPP"#Gen.DecoderNamespace#
                          !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 6c7e88fbe2eb86..619c7dc69f9b27 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -2765,12 +2765,40 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
   case Hexagon::PS_vloadrw_nt_ai:
   case Hexagon::V6_vL32b_ai:
   case Hexagon::V6_vS32b_ai:
+  case Hexagon::V6_vS32b_pred_ai:
+  case Hexagon::V6_vS32b_npred_ai:
   case Hexagon::V6_vS32b_qpred_ai:
   case Hexagon::V6_vS32b_nqpred_ai:
+  case Hexagon::V6_vS32b_new_ai:
+  case Hexagon::V6_vS32b_new_pred_ai:
+  case Hexagon::V6_vS32b_new_npred_ai:
+  case Hexagon::V6_vS32b_nt_pred_ai:
+  case Hexagon::V6_vS32b_nt_npred_ai:
+  case Hexagon::V6_vS32b_nt_new_ai:
+  case Hexagon::V6_vS32b_nt_new_pred_ai:
+  case Hexagon::V6_vS32b_nt_new_npred_ai:
+  case Hexagon::V6_vS32b_nt_qpred_ai:
+  case Hexagon::V6_vS32b_nt_nqpred_ai:
   case Hexagon::V6_vL32b_nt_ai:
   case Hexagon::V6_vS32b_nt_ai:
   case Hexagon::V6_vL32Ub_ai:
   case Hexagon::V6_vS32Ub_ai:
+  case Hexagon::V6_vL32b_cur_ai:
+  case Hexagon::V6_vL32b_tmp_ai:
+  case Hexagon::V6_vL32b_pred_ai:
+  case Hexagon::V6_vL32b_npred_ai:
+  case Hexagon::V6_vL32b_cur_pred_ai:
+  case Hexagon::V6_vL32b_cur_npred_ai:
+  case Hexagon::V6_vL32b_tmp_pred_ai:
+  case Hexagon::V6_vL32b_tmp_npred_ai:
+  case Hexagon::V6_vL32b_nt_cur_ai:
+  case Hexagon::V6_vL32b_nt_tmp_ai:
+  case Hexagon::V6_vL32b_nt_pred_ai:
+  case Hexagon::V6_vL32b_nt_npred_ai:
+  case Hexagon::V6_vL32b_nt_cur_pred_ai:
+  case Hexagon::V6_vL32b_nt_cur_npred_ai:
+  case Hexagon::V6_vL32b_nt_tmp_pred_ai:
+  case Hexagon::V6_vL32b_nt_tmp_npred_ai:
   case Hexagon::V6_vgathermh_pseudo:
   case Hexagon::V6_vgathermw_pseudo:
   case Hexagon::V6_vgathermhw_pseudo:
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 7e3dcb3283caba..8bac41372b5a83 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -399,9 +399,9 @@ void RISCVFrameLowering::adjustStackForRVV(MachineFunction &MF,
 
   // Optimize compile time offset case
   StackOffset Offset = StackOffset::getScalable(Amount);
-  if (STI.getRealMinVLen() == STI.getRealMaxVLen()) {
+  if (auto VLEN = STI.getRealVLen()) {
     // 1. Multiply the number of v-slots by the (constant) length of register
-    const int64_t VLENB = STI.getRealMinVLen() / 8;
+    const int64_t VLENB = *VLEN / 8;
     assert(Amount % 8 == 0 &&
            "Reserve the stack by the multiple of one vector size.");
     const int64_t NumOfVReg = Amount / 8;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 80797e36ad40fe..904f1d7fdf9065 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -577,9 +577,8 @@ void RISCVDAGToDAGISel::selectVSETVLI(SDNode *Node) {
   SDValue VLOperand;
   unsigned Opcode = RISCV::PseudoVSETVLI;
   if (auto *C = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
-    const unsigned VLEN = Subtarget->getRealMinVLen();
-    if (VLEN == Subtarget->getRealMaxVLen())
-      if (VLEN / RISCVVType::getSEWLMULRatio(SEW, VLMul) == C->getZExtValue())
+    if (auto VLEN = Subtarget->getRealVLen())
+      if (*VLEN / RISCVVType::getSEWLMULRatio(SEW, VLMul) == C->getZExtValue())
         VLMax = true;
   }
   if (VLMax || isAllOnesConstant(Node->getOperand(1))) {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9ab6895aed521e..874c851cd9147a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -8092,12 +8092,11 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
     // If we're compiling for an exact VLEN value, we can always perform
     // the insert in m1 as we can determine the register corresponding to
     // the index in the register group.
-    const unsigned MinVLen = Subtarget.getRealMinVLen();
-    const unsigned MaxVLen = Subtarget.getRealMaxVLen();
     const MVT M1VT = getLMUL1VT(ContainerVT);
-    if (MinVLen == MaxVLen && ContainerVT.bitsGT(M1VT)) {
+    if (auto VLEN = Subtarget.getRealVLen();
+        VLEN && ContainerVT.bitsGT(M1VT)) {
       EVT ElemVT = VecVT.getVectorElementType();
-      unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+      unsigned ElemsPerVReg = *VLEN / ElemVT.getFixedSizeInBits();
       unsigned RemIdx = OrigIdx % ElemsPerVReg;
       unsigned SubRegIdx = OrigIdx / ElemsPerVReg;
       unsigned ExtractIdx =
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index ca519dbc4c0359..9d1f01dffaaf47 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -283,8 +283,8 @@ void RISCVRegisterInfo::lowerVSPILL(MachineBasicBlock::iterator II) const {
 
   Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
   // Optimize for constant VLEN.
-  if (STI.getRealMinVLen() == STI.getRealMaxVLen()) {
-    const int64_t VLENB = STI.getRealMinVLen() / 8;
+  if (auto VLEN = STI.getRealVLen()) {
+    const int64_t VLENB = *VLEN / 8;
     int64_t Offset = VLENB * LMUL;
     STI.getInstrInfo()->movImm(MBB, II, DL, VL, Offset);
   } else {
@@ -360,8 +360,8 @@ void RISCVRegisterInfo::lowerVRELOAD(MachineBasicBlock::iterator II) const {
 
   Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
   // Optimize for constant VLEN.
-  if (STI.getRealMinVLen() == STI.getRealMaxVLen()) {
-    const int64_t VLENB = STI.getRealMinVLen() / 8;
+  if (auto VLEN = STI.getRealVLen()) {
+    const int64_t VLENB = *VLEN / 8;
     int64_t Offset = VLENB * LMUL;
     STI.getInstrInfo()->movImm(MBB, II, DL, VL, Offset);
   } else {
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 8c55efa69a6a5f..4b60d7aff22a0f 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -188,6 +188,14 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     unsigned VLen = getMaxRVVVectorSizeInBits();
     return VLen == 0 ? 65536 : VLen;
   }
+  // If we know the exact VLEN, return it.  Otherwise, return std::nullopt.
+  std::optional<unsigned> getRealVLen() const {
+    unsigned Min = getRealMinVLen();
+    if (Min != getRealMaxVLen())
+      return std::nullopt;
+    return Min;
+  }
+
   RISCVABI::ABI getTargetABI() const { return TargetABI; }
   bool isSoftFPABI() const {
     return TargetABI == RISCVABI::ABI_LP64 ||
diff --git a/llvm/test/CodeGen/Hexagon/ldst_vector_offset.ll b/llvm/test/CodeGen/Hexagon/ldst_vector_offset.ll
new file mode 100644
index 00000000000000..15695e83501652
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/ldst_vector_offset.ll
@@ -0,0 +1,59 @@
+; REQUIRES: asserts
+; RUN: llc -O3 -march=hexagon < %s -o /dev/null
+; Make sure that this doesn't crash.
+; This test validates that the compiler would not assert when analyzing the
+; offset of V6_vS32b_pred_ai instruction
+
+%struct.pluto = type <{ ptr, i16, ptr }>
+
+@global = external hidden unnamed_addr constant [62 x i8], align 1
+@global.1 = external hidden unnamed_addr constant [47 x i8], align 1
+@global.2 = hidden local_unnamed_addr constant %struct.pluto <{ ptr @global, i16 892, ptr @global.1 }>, align 1
+@global.3 = local_unnamed_addr constant [1 x i32] zeroinitializer
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.assume(i1 noundef) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vd0.128B() #1
+
+; Function Attrs: noinline nounwind
+declare hidden fastcc void @zot(i32, i32, i32, i32) unnamed_addr #2
+
+; Function Attrs: noinline nounwind
+define void @barney(ptr nocapture %arg, ptr nocapture readnone %arg1, i8 signext %arg2, i32 %arg3, ptr nocapture readnone %arg4, ptr nocapture readnone %arg5, i32 %arg6, i32 %arg7, ptr nocapture readnone %arg8, ptr nocapture readnone %arg9, ptr nocapture readnone %arg10, ptr nocapture readnone %arg11, ptr nocapture readnone %arg12, ptr nocapture readnone %arg13, ptr nocapture readnone %arg14, ptr nocapture readnone %arg15, ptr nocapture readnone %arg16, ptr nocapture readnone %arg17) local_unnamed_addr #2 {
+bb:
+  %icmp = icmp ult i32 %arg3, 4
+  tail call void @llvm.assume(i1 %icmp) #3
+  %call = tail call <32 x i32> @llvm.hexagon.V6.vd0.128B() #3
+  br label %bb18
+
+bb18:                                             ; preds = %bb22, %bb
+  %phi = phi i32 [ %and, %bb22 ], [ %arg3, %bb ]
+  %phi19 = phi i32 [ %add23, %bb22 ], [ 4, %bb ]
+  %icmp20 = icmp eq i32 %phi, 0
+  br i1 %icmp20, label %bb21, label %bb22
+
+bb21:                                             ; preds = %bb18
+  %shl = shl i32 %phi19, 8
+  %getelementptr = getelementptr inbounds i8, ptr %arg, i32 %shl
+  %bitcast = bitcast ptr %getelementptr to ptr
+  store <32 x i32> %call, ptr %bitcast, align 128
+  br label %bb22
+
+bb22:                                             ; preds = %bb21, %bb18
+  %add = add nuw nsw i32 %phi, 1
+  %and = and i32 %add, 3
+  %add23 = add nuw nsw i32 %phi19, 1
+  %icmp24 = icmp eq i32 %add23, 8
+  br i1 %icmp24, label %bb25, label %bb18
+
+bb25:                                             ; preds = %bb22
+  tail call fastcc void @zot(i32 %arg6, i32 %arg7, i32 0, i32 %arg3)
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { noinline nounwind "target-cpu"="hexagonv68" "target-features"="+hvx-length128b,+hvxv68,+v68,+hvx-ieee-fp,-long-calls,-small-data" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
diff --git a/llvm/test/DebugInfo/fixed-point.ll b/llvm/test/DebugInfo/fixed-point.ll
new file mode 100644
index 00000000000000..eaa67ead3f3136
--- /dev/null
+++ b/llvm/test/DebugInfo/fixed-point.ll
@@ -0,0 +1,32 @@
+;; This fixes https://github.com/llvm/llvm-project/issues/81555
+; REQUIRES: object-emission
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - -verify | FileCheck %s --check-prefix=VERIFY
+
+; VERIFY-NOT: error:
+
+; CHECK: {{.*}}:   DW_TAG_base_type
+; CHECK-NEXT:          DW_AT_name  ("var")
+; CHECK-NEXT:          DW_AT_encoding  (DW_ATE_signed_fixed)
+define void @func() !dbg !26 {
+entry:
+  %classifier = alloca i32, align 4
+  tail call void @llvm.dbg.value(metadata i32 32768, metadata !37, metadata !DIExpression()), !dbg !39
+  store i32 32768, ptr %classifier, align 4, !dbg !39
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!19}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, emissionKind: FullDebug)
+!1 = !DIFile(filename: "a", directory: "")
+!6 = !DIBasicType(name: "var", size: 32, encoding: DW_ATE_signed_fixed)
+!19 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !DISubroutineType(types: null)
+!26 = distinct !DISubprogram(unit: !0, type: !3)
+!37 = !DILocalVariable(name: "intercept", arg: 2, scope: !26, file: !1, line: 7, type: !6)
+!39 = !DILocation(line: 0, scope: !26)
+
diff --git a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test
index 244827196f485e..33ad5515a6357a 100644
--- a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test
+++ b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test
@@ -1,6 +1,6 @@
 # RUN: llc -filetype=obj -o %t.o %S/Inputs/main-ret-0.ll
 # RUN: llvm-jitlink -noexec \
-# RUN:   -sectcreate __data,%S/Inputs/sectcreate-data.txt:foo=0 \
+# RUN:   -sectcreate __data,%S/Inputs/sectcreate-data.txt@foo=0 \
 # RUN:   %t.o
 #
 # Use -sectcreate to create a section from a data file.
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index 9a94162005e1f7..d288c02a22c921 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -2116,6 +2116,12 @@ v_dot2_bf16_bf16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
 v_dot2_bf16_bf16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1]
 // GFX11: encoding: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
+v_dot2_bf16_bf16 v2, v0, 0x20004000, v2
+// GFX11: v_dot2_bf16_bf16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+
+v_dot2_bf16_bf16 v2, 0x20004000, v0, v2
+// GFX11: v_dot2_bf16_bf16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+
 v_dot2_f16_f16 v5, v1, v2, s3
 // GFX11: encoding: [0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00]
 
@@ -2161,6 +2167,12 @@ v_dot2_f16_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
 v_dot2_f16_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1]
 // GFX11: encoding: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
+v_dot2_f16_f16 v2, v0, 0x20004000, v2
+// GFX11: v_dot2_f16_f16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+
+v_dot2_f16_f16 v2, 0x20004000, v0, v2
+// GFX11: v_dot2_f16_f16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+
 v_fma_dx9_zero_f32 v5, v1, v2, s3
 // GFX11: encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00]
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
index 7674c02185b5f2..fc35a2e6b4f8f4 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
@@ -1788,6 +1788,12 @@
 # GFX11: v_dot2_bf16_bf16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] ; encoding: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
 
+# GFX11: v_dot2_bf16_bf16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20
+
+# GFX11: v_dot2_bf16_bf16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20
+
 # GFX11: v_dot2_f16_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00]
 0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00
 
@@ -1833,6 +1839,12 @@
 # GFX11: v_dot2_f16_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] ; encoding: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
 
+# GFX11: v_dot2_f16_f16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20
+
+# GFX11: v_dot2_f16_f16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20
+
 # GFX11: v_fma_dx9_zero_f32 v5, v1, v2, s3       ; encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00]
 0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00
 
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 8ecae5dbe37202..4c05317036d1a3 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -415,10 +415,11 @@ def version_int(ver):
     config.available_features.add("llvm-dylib")
     config.substitutions.append(
         (
+            # libLLVM.so.19.0git
             "%llvmdylib",
-            "{}/libLLVM-{}{}".format(
-                config.llvm_shlib_dir, config.llvm_dylib_version, config.llvm_shlib_ext
-            ),
+            "{}/libLLVM{}.{}".format(
+                config.llvm_shlib_dir, config.llvm_shlib_ext, config.llvm_dylib_version
+            )
         )
     )
 
diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in
index 1138b2ccf7bce7..b6f255d472d16f 100644
--- a/llvm/test/lit.site.cfg.py.in
+++ b/llvm/test/lit.site.cfg.py.in
@@ -44,7 +44,7 @@ config.build_examples = @LLVM_BUILD_EXAMPLES@
 config.enable_threads = @LLVM_ENABLE_THREADS@
 config.build_shared_libs = @BUILD_SHARED_LIBS@
 config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@
-config.llvm_dylib_version = "@LLVM_VERSION_MAJOR@@LLVM_VERSION_SUFFIX@"
+config.llvm_dylib_version = "@LLVM_VERSION_MAJOR@.@LLVM_VERSION_MINOR@@LLVM_VERSION_SUFFIX@"
 config.llvm_host_triple = '@LLVM_HOST_TRIPLE@'
 config.host_arch = "@HOST_ARCH@"
 config.have_opt_viewer_modules = @LLVM_HAVE_OPT_VIEWER_MODULES@
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index f6280779ded10f..f0b8310a32efd3 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -165,7 +165,7 @@ static cl::list<std::string>
 
 static cl::list<std::string>
     SectCreate("sectcreate",
-               cl::desc("given <sectname>,<filename>[:<sym>=<offset>,...]  "
+               cl::desc("given <sectname>,<filename>[@<sym>=<offset>,...]  "
                         "add the content of <filename> to <sectname>"),
                cl::cat(JITLinkCategory));
 
@@ -1683,7 +1683,7 @@ static Error addSectCreates(Session &S,
 
     StringRef SCArg(*SCItr);
 
-    auto [SectAndFileName, ExtraSymbolsString] = SCArg.split(':');
+    auto [SectAndFileName, ExtraSymbolsString] = SCArg.split('@');
     auto [SectName, FileName] = SectAndFileName.rsplit(',');
     if (SectName.empty())
       return make_error<StringError>("In -sectcreate=" + SCArg +
diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp
index cafef5b5fad512..b73891b59f0264 100644
--- a/llvm/unittests/ADT/STLExtrasTest.cpp
+++ b/llvm/unittests/ADT/STLExtrasTest.cpp
@@ -1005,7 +1005,7 @@ TEST(STLExtras, Unique) {
 }
 
 TEST(STLExtras, UniqueNoPred) {
-  std::vector<uint32_t> V = {1, 5, 5, 4, 3, 3, 3};
+  std::vector<int> V = {1, 5, 5, 4, 3, 3, 3};
 
   auto I = llvm::unique(V);
 
diff --git a/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn
index 62b4af0635841a..ee60eee0da0fb8 100644
--- a/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn
@@ -6,6 +6,7 @@ static_library("ExtractAPI") {
     "//clang/lib/Basic",
     "//clang/lib/Frontend",
     "//clang/lib/Index",
+    "//clang/lib/InstallAPI",
     "//llvm/lib/Support",
     "//llvm/lib/TargetParser",
   ]
diff --git a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
index 4d79ac805ac193..6eae7e293dce61 100644
--- a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
@@ -6,5 +6,9 @@ static_library("InstallAPI") {
     "//llvm/lib/Support",
     "//llvm/lib/TextAPI",
   ]
-  sources = [ "Context.cpp" ]
+  sources = [
+    "Context.cpp",
+    "FileList.cpp",
+    "HeaderFile.cpp",
+  ]
 }
diff --git a/llvm/utils/gn/secondary/clang/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/BUILD.gn
index b60c5264d60cce..354934f4b18ab1 100644
--- a/llvm/utils/gn/secondary/clang/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/BUILD.gn
@@ -13,6 +13,7 @@ group("unittests") {
     "Format:FormatTests",
     "Frontend:FrontendTests",
     "Index:IndexTests",
+    "InstallAPI:InstallAPITests",
     "Interpreter:ClangReplInterpreterTests",
     "Introspection:IntrospectionTests",
     "Lex:LexTests",
diff --git a/llvm/utils/gn/secondary/clang/unittests/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/InstallAPI/BUILD.gn
new file mode 100644
index 00000000000000..e27659457474f2
--- /dev/null
+++ b/llvm/utils/gn/secondary/clang/unittests/InstallAPI/BUILD.gn
@@ -0,0 +1,13 @@
+import("//third-party/unittest/unittest.gni")
+
+unittest("InstallAPITests") {
+  configs += [ "//llvm/utils/gn/build:clang_code" ]
+  deps = [
+    "//clang/lib/InstallAPI",
+    "//llvm/lib/Testing/Support",
+  ]
+  sources = [
+    "HeaderFileTest.cpp",
+    "FileListTest.cpp",
+  ]
+}
diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
index a813bf3f508b2d..31f567d6df159d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
@@ -50,6 +50,7 @@ static_library("Orc") {
     "OrcABISupport.cpp",
     "OrcV2CBindings.cpp",
     "RTDyldObjectLinkingLayer.cpp",
+    "SectCreate.cpp",
     "SimpleRemoteEPC.cpp",
     "SpeculateAnalyses.cpp",
     "Speculation.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
index ab4fd8e6403e76..3257f4b5ff2363 100644
--- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
@@ -74,6 +74,7 @@ write_lit_config("lit_site_cfg") {
     "LLVM_USE_INTEL_JITEVENTS=0",
     "LLVM_USE_SANITIZER=",
     "LLVM_VERSION_MAJOR=$llvm_version_major",
+    "LLVM_VERSION_MINOR=$llvm_version_minor",
     "LLVM_VERSION_SUFFIX=git",
     "Python3_EXECUTABLE=$python_path",
     "TARGETS_TO_BUILD=$llvm_targets_to_build_string",
diff --git a/llvm/utils/release/github-upload-release.py b/llvm/utils/release/github-upload-release.py
index a8bb569d2fc999..14ec05062d88c8 100755
--- a/llvm/utils/release/github-upload-release.py
+++ b/llvm/utils/release/github-upload-release.py
@@ -77,20 +77,28 @@ def upload_files(repo, release, files):
 parser.add_argument("--token", type=str)
 parser.add_argument("--release", type=str)
 parser.add_argument("--user", type=str)
+parser.add_argument("--user-token", type=str)
 
 # Upload args
 parser.add_argument("--files", nargs="+", type=str)
 
 args = parser.parse_args()
 
-github = github.Github(args.token)
-llvm_org = github.get_organization("llvm")
+gh = github.Github(args.token)
+llvm_org = gh.get_organization("llvm")
 llvm_repo = llvm_org.get_repo("llvm-project")
 
 if args.user:
+    if not args.user_token:
+        print("--user-token option required when --user is used")
+        sys.exit(1)
     # Validate that this user is allowed to modify releases.
-    user = github.get_user(args.user)
-    team = llvm_org.get_team_by_slug("llvm-release-managers")
+    user = gh.get_user(args.user)
+    team = (
+        github.Github(args.user_token)
+        .get_organization("llvm")
+        .get_team_by_slug("llvm-release-managers")
+    )
     if not team.has_in_members(user):
         print("User {} is not a allowed to modify releases".format(args.user))
         sys.exit(1)
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index 272bc3116c5fdc..92d844eefb7207 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -245,7 +245,7 @@ def MapOp : LinalgStructuredBase_Op<"map", [
           }
     ```
 
-    Shortened print form is available. Applies to simple maps with one 
+    Shortened print form is available. Applies to simple maps with one
     non-yield operation inside the body.
 
     The example above will be printed as:
@@ -458,6 +458,7 @@ def TransposeOp : LinalgStructuredBase_Op<"transpose", [
                              ::mlir::OperationState & odsState);
   }];
 
+  let hasFolder = 1;
   let hasCustomAssemblyFormat = 1;
   let hasVerifier = 1;
 }
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
index 8ba7c111aea6bb..b9cd15e2062669 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
@@ -16,6 +16,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/BuiltinTypes.td"
 include "mlir/IR/CommonAttrConstraints.td"
 include "mlir/IR/CommonTypeConstraints.td"
+include "mlir/IR/OpAsmInterface.td"
 include "mlir/IR/SymbolInterfaces.td"
 
 //===----------------------------------------------------------------------===//
@@ -78,7 +79,10 @@ def Mesh_MeshOp : Mesh_Op<"mesh", [Symbol]> {
 }
 
 def Mesh_MeshShapeOp : Mesh_Op<"mesh_shape", [
-  Pure, DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
+    Pure,
+    DeclareOpInterfaceMethods<SymbolUserOpInterface>,
+    DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>
+  ]> {
   let summary = "Get the shape of the mesh.";
   let arguments = (ins
     FlatSymbolRefAttr:$mesh,
@@ -101,7 +105,11 @@ def Mesh_MeshShapeOp : Mesh_Op<"mesh_shape", [
   ];
 }
 
-def Mesh_ShardOp : Mesh_Op<"shard", [Pure, SameOperandsAndResultType]> {
+def Mesh_ShardOp : Mesh_Op<"shard", [
+    Pure,
+    SameOperandsAndResultType,
+    DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>
+  ]> {
   let summary = "Annotate on how a tensor is sharded across a mesh.";
   let description = [{
     The mesh.shard operation is designed to specify and guide the sharding
@@ -194,7 +202,8 @@ def Mesh_ShardOp : Mesh_Op<"shard", [Pure, SameOperandsAndResultType]> {
 
 def Mesh_ProcessMultiIndexOp : Mesh_Op<"process_multi_index", [
   Pure,
-  DeclareOpInterfaceMethods<SymbolUserOpInterface>
+  DeclareOpInterfaceMethods<SymbolUserOpInterface>,
+  DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>
 ]> {
   let summary = "Get the multi index of current device along specified mesh axes.";
   let description = [{
@@ -221,7 +230,8 @@ def Mesh_ProcessMultiIndexOp : Mesh_Op<"process_multi_index", [
 
 def Mesh_ProcessLinearIndexOp : Mesh_Op<"process_linear_index", [
   Pure,
-  DeclareOpInterfaceMethods<SymbolUserOpInterface>
+  DeclareOpInterfaceMethods<SymbolUserOpInterface>,
+  DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>
 ]> {
   let summary = "Get the linear index of the current device.";
   let description = [{
@@ -248,7 +258,10 @@ class Mesh_CollectiveCommunicationOpBase<
     string mnemonic, list<Trait> traits = []> :
     Mesh_Op<mnemonic,
       !listconcat(traits,
-      [DeclareOpInterfaceMethods<SymbolUserOpInterface>])> {
+        [
+          DeclareOpInterfaceMethods<SymbolUserOpInterface>,
+          DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>
+        ])> {
   dag commonArgs = (ins
     FlatSymbolRefAttr:$mesh,
     DefaultValuedAttr<Mesh_MeshAxesAttr, "{}">:$mesh_axes
@@ -258,7 +271,7 @@ class Mesh_CollectiveCommunicationOpBase<
 def Mesh_AllGatherOp : Mesh_CollectiveCommunicationOpBase<"all_gather", [
     Pure,
     SameOperandsAndResultElementType,
-    SameOperandsAndResultRank
+    SameOperandsAndResultRank,
   ]> {
   let summary = "All-gather over a device mesh.";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
index 41a14575ed1054..a00c9c31256c96 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -283,7 +283,13 @@ struct LevelType {
   }
   bool operator!=(const LevelType lhs) const { return !(*this == lhs); }
 
-  LevelType stripProperties() const { return LevelType(lvlBits & ~0xffff); }
+  LevelType stripStorageIrrelevantProperties() const {
+    // Properties other than `SoA` do not change the storage scheme of the
+    // sparse tensor.
+    constexpr uint64_t mask =
+        0xffff & ~static_cast<uint64_t>(LevelPropNonDefault::SoA);
+    return LevelType(lvlBits & ~mask);
+  }
 
   /// Get N of NOutOfM level type.
   constexpr uint64_t getN() const {
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
index 24a5640d820e43..1a090ddb782fdb 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
@@ -24,6 +24,7 @@ struct COOSegment {
   std::pair<Level, Level> lvlRange; // [low, high)
   bool isSoA;
 
+  bool isAoS() const { return !isSoA; }
   bool isSegmentStart(Level l) const { return l == lvlRange.first; }
   bool inSegment(Level l) const {
     return l >= lvlRange.first && l < lvlRange.second;
@@ -337,7 +338,9 @@ class SparseTensorType {
   /// Returns the starting level of this sparse tensor type for a
   /// trailing COO region that spans **at least** two levels. If
   /// no such COO region is found, then returns the level-rank.
-  Level getCOOStart() const;
+  ///
+  /// DEPRECATED: use getCOOSegment instead;
+  Level getAoSCOOStart() const;
 
   /// Returns [un]ordered COO type for this sparse tensor type.
   RankedTensorType getCOOType(bool ordered) const;
diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
index fe9b16cb44b3da..d09c9e36f6ff88 100644
--- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
@@ -32,13 +32,11 @@ FailureOr<RankedTensorType>
 computeTransposedType(RankedTensorType rankedTensorType,
                       ArrayRef<int64_t> transposeVector);
 
-/// Given a tensor::PackOp, compute the permutation vector to shuffle the
-/// packed shape into the shape before any outer or inner permutations have
-/// been applied.
-/// i.e. for a pack from an ABCD layout to an ABCDba:
-/// The packed shape would be ABCDba.
-/// The pre-permutation shape would be AaBbCD.
-SmallVector<int64_t> getPackInverseDestPermutation(PackOp packOp);
+SmallVector<int64_t> getPackInverseDestPerm(tensor::PackOp packOp);
+SmallVector<int64_t> getUnPackInverseSrcPerm(tensor::UnPackOp unpackOp);
+
+SmallVector<int64_t> getUnPackInverseSrcPerm(tensor::UnPackOp,
+                                             PackingMetadata &metadata);
 
 /// A tensor.insert_slice is a cast-like operation if it merely rank-extends the
 /// source tensor or inserts the source tensor into a destination tensor with
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index a0f02f6a7f259d..919f5130e1760f 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -1786,6 +1786,22 @@ void TransposeOp::getEffects(
                         getDpsInits());
 }
 
+LogicalResult TransposeOp::fold(FoldAdaptor adaptor,
+                                SmallVectorImpl<OpFoldResult> &result) {
+  // Single dimension transpose.
+  if (getPermutation().size() == 0) {
+    result.push_back(getInput());
+    return success();
+  }
+  // Identity permutation.
+  if (isIdentityPermutation(getPermutation())) {
+    result.push_back(getInput());
+    return success();
+  }
+
+  return failure();
+}
+
 //===----------------------------------------------------------------------===//
 // BroadcastOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 4ef8859fd5c430..299965bcfc3ab3 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -3152,7 +3152,8 @@ DiagnosedSilenceableFailure transform::VectorizeOp::apply(
 
   // TODO: Check that the correct number of vectorSizes was provided.
   for (Operation *target : targets) {
-    if (!isa<linalg::LinalgOp, tensor::PadOp, tensor::PackOp>(target)) {
+    if (!isa<linalg::LinalgOp, tensor::PadOp, tensor::PackOp, tensor::UnPackOp>(
+            target)) {
       return mlir::emitSilenceableFailure(target->getLoc())
              << "Unsupported Op, cannot vectorize";
     }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 01b393644679c5..a17bc8e4cd318f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -237,7 +237,7 @@ FailureOr<LowerPackResult> linalg::lowerPack(RewriterBase &rewriter,
   PackingMetadata packingMetadata = computePackingMetadata(
       packedTensorType.getRank(), packOp.getInnerDimsPos());
   SmallVector<int64_t> packedToStripMinedShapePerm =
-      tensor::getPackInverseDestPermutation(packOp);
+      tensor::getPackInverseDestPerm(packOp);
 
   // 3. Compute the stripMinedShape: this is the packed shape before any outer
   // or inner permutations have been applied.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 2bd6929fea6142..ac043e87223dfe 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1405,8 +1405,7 @@ vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state,
 /// permutations.
 static SmallVector<int64_t> getTiledPackShape(tensor::PackOp packOp,
                                               ArrayRef<int64_t> destShape) {
-  return applyPermutation(destShape,
-                          tensor::getPackInverseDestPermutation(packOp));
+  return applyPermutation(destShape, tensor::getPackInverseDestPerm(packOp));
 }
 
 /// Create a TransferReadOp from `source` with static shape `readShape`. If the
@@ -1547,7 +1546,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
 
   // Create TransposeOp.
   auto destPermutation =
-      invertPermutationVector(tensor::getPackInverseDestPermutation(packOp));
+      invertPermutationVector(tensor::getPackInverseDestPerm(packOp));
   auto transposeOp = rewriter.create<vector::TransposeOp>(
       loc, shapeCastOp.getResult(), destPermutation);
 
@@ -1559,6 +1558,112 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
   return success();
 }
 
+/// Vectorize a `tensor::UnPackOp` to these 4 Ops:
+///   Vector::TransferReadOp - Reads a vector from the source tensor
+///   vector::TransposeOp - Transpose the Source tensor
+///   ShapeCastOp - Reshape the data based on the target.
+///   vector::TransferWriteOp. - Write the result vector back to the destination
+///   tensor
+static LogicalResult
+vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp,
+                          ArrayRef<int64_t> inputVectorSizes,
+                          SmallVectorImpl<Value> &newResults) {
+
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(unpackOp);
+
+  RankedTensorType unpackTensorType = unpackOp.getSourceType();
+
+  ArrayRef<int64_t> innerDimPos = unpackOp.getInnerDimsPos();
+  ArrayRef<int64_t> innerTiles = unpackOp.getStaticInnerTiles();
+
+  SmallVector<int64_t> readMaskShape(inputVectorSizes.begin(),
+                                     inputVectorSizes.end());
+  ArrayRef<int64_t> outerDimsPerm = unpackOp.getOuterDimsPerm();
+  ArrayRef<int64_t> sourceShape = unpackTensorType.getShape();
+
+  // ReadMask is the size of tensor used to read and apply mask. It is
+  // set like this: Let's say the vectorSize (VS) array is size 'N' and
+  // the sourceShape(SS) is 'M' where M >= N and InnerTileSizes (IT) of
+  // size M-N
+  // Thus:
+  // - initially: ReadMaskShape = vectorInputSizes
+  // - Divide all the readMaskShape locations pointed by innerDimPos
+  //   by the innerTileSize attribute value.
+  // - if outer_dims_perms is present: do that permutation on readMaskShape.
+  // - Append the remaining shape from SS
+  // E.g. let's say let's say unpackTensorType.getShape() = <8x8x32x16>
+  // inner Dim Pos = [0, 1] and Inner Tiles = [32, 16], vector_sizes are [512,
+  // 128] and outer_dims_perm is [1, 0] then read shape is:
+  //   ReadMaskShape(initial): [512, 128]
+  //   Final Value(after innerDim Adjustment): [512/32, 128/16]
+  //                                           = [16, 8]
+  //   After applying outer_dims_perm: [8, 16]
+  //   After appending the rest of the sourceShape: [8, 16, 32, 16]
+
+  for (auto [index, size] : enumerate(innerTiles)) {
+    readMaskShape[innerDimPos[index]] =
+        llvm::divideCeil(readMaskShape[innerDimPos[index]], size);
+  }
+  if (!outerDimsPerm.empty()) {
+    applyPermutationToVector(readMaskShape, outerDimsPerm);
+  }
+  readMaskShape.append(sourceShape.begin() + inputVectorSizes.size(),
+                       sourceShape.end());
+
+  ReifiedRankedShapedTypeDims reifiedRetShapes;
+  LogicalResult status =
+      cast<ReifyRankedShapedTypeOpInterface>(unpackOp.getOperation())
+          .reifyResultShapes(rewriter, reifiedRetShapes);
+  if (status.failed()) {
+    LDBG("Unable to reify result shapes of " << unpackOp);
+    return failure();
+  }
+  Location loc = unpackOp->getLoc();
+
+  auto padValue = rewriter.create<arith::ConstantOp>(
+      loc, rewriter.getZeroAttr(unpackOp.getSourceType().getElementType()));
+
+  // Read result, mask if necessary. If transferReadOp shape is not equal
+  // to shape of source, then a mask is necessary.
+  Value readResult = createReadOrMaskedRead(
+      rewriter, loc, unpackOp.getSource(),
+      ArrayRef<int64_t>(readMaskShape.begin(), readMaskShape.end()), padValue);
+
+  PackingMetadata packMetadata;
+  SmallVector<int64_t> lastDimToInsertPosPerm =
+      tensor::getUnPackInverseSrcPerm(unpackOp, packMetadata);
+  ShapedType maskedOpShapedType = cast<ShapedType>(readResult.getType());
+  SmallVector<int64_t> stripMineShape(maskedOpShapedType.getShape());
+  mlir::Type stripMineElemType = maskedOpShapedType.getElementType();
+  applyPermutationToVector(stripMineShape, lastDimToInsertPosPerm);
+  RankedTensorType stripMineTensorType =
+      RankedTensorType::get(stripMineShape, stripMineElemType);
+  // Transpose the appropriate rows to match output.
+  vector::TransposeOp transposeOp = rewriter.create<vector::TransposeOp>(
+      loc, readResult, lastDimToInsertPosPerm);
+
+  // Collapse the vector to the size required by result.
+  RankedTensorType collapsedType = tensor::CollapseShapeOp::inferCollapsedType(
+      stripMineTensorType, packMetadata.reassociations);
+  mlir::VectorType vecCollapsedType =
+      VectorType::get(collapsedType.getShape(), collapsedType.getElementType());
+  vector::ShapeCastOp shapeCastOp = rewriter.create<vector::ShapeCastOp>(
+      loc, vecCollapsedType, transposeOp->getResult(0));
+
+  // WriteMaskShape had to match the shapecast shape for dynamic sizes,
+  // otherwise the validator complains that the mask size is invalid.
+  SmallVector<int64_t> writeMaskShape(
+      unpackOp.getDestType().hasStaticShape()
+          ? inputVectorSizes
+          : shapeCastOp.getResultVectorType().getShape());
+  Operation *write =
+      createWriteOrMaskedWrite(rewriter, loc, shapeCastOp.getResult(),
+                               reifiedRetShapes[0], writeMaskShape);
+  newResults.push_back(write->getResult(0));
+  return success();
+}
+
 /// Vectorize a `padOp` with (1) static result type, (2) constant padding value
 /// and (3) all-zero lowPad to
 ///   `transfer_write_in_bounds(transfer_read_masked(pad_source, pad_value))`.
@@ -1655,6 +1760,25 @@ isValidMaskedInputVector(ArrayRef<int64_t> shape,
   return success();
 }
 
+/// Need to check if the inner-tiles are static/constant.
+static LogicalResult
+vectorizeUnPackOpPrecondition(tensor::UnPackOp unpackOp,
+                              ArrayRef<int64_t> inputVectorSizes) {
+
+  if (llvm::any_of(unpackOp.getInnerTiles(), [](OpFoldResult res) {
+        return !getConstantIntValue(res).has_value();
+      })) {
+    LDBG("Inner-tiles must be constant: " << unpackOp << "\n");
+    return failure();
+  }
+  llvm::ArrayRef<int64_t> resultShape = unpackOp.getDestType().getShape();
+  if (!inputVectorSizes.empty() &&
+      failed(isValidMaskedInputVector(resultShape, inputVectorSizes)))
+    return failure();
+
+  return success();
+}
+
 static LogicalResult
 vectorizeLinalgOpPrecondition(LinalgOp linalgOp,
                               ArrayRef<int64_t> inputVectorSizes,
@@ -1703,9 +1827,10 @@ vectorizeLinalgOpPrecondition(LinalgOp linalgOp,
   }
   if (isElementwise(linalgOp))
     return success();
-  // TODO: isaConvolutionOpInterface that can also infer from generic features.
-  // But we will still need stride/dilation attributes that will be annoying to
-  // reverse-engineer...
+
+  // TODO: isaConvolutionOpInterface that can also infer from generic
+  // features. But we will still need stride/dilation attributes that will be
+  // annoying to reverse-engineer...
   if (isa<ConvolutionOpInterface>(linalgOp.getOperation()))
     return success();
   // TODO: the common vector shape is equal to the static loop sizes only when
@@ -1810,6 +1935,9 @@ LogicalResult mlir::linalg::vectorizeOpPrecondition(
       .Case<tensor::PackOp>([&](auto packOp) {
         return vectorizePackOpPrecondition(packOp, inputVectorSizes);
       })
+      .Case<tensor::UnPackOp>([&](auto unpackOp) {
+        return vectorizeUnPackOpPrecondition(unpackOp, inputVectorSizes);
+      })
       .Default([](auto) { return failure(); });
 }
 
@@ -1829,11 +1957,11 @@ static void convertAffineApply(RewriterBase &rewriter, LinalgOp linalgOp) {
 }
 
 /// Emit a suitable vector form for an operation. If provided,
-/// `inputVectorSizes` are used to vectorize this operation. `inputVectorSizes`
-/// must match the rank of the iteration space of the operation and the input
-/// vector sizes must be greater than or equal to their counterpart iteration
-/// space sizes, if static. `inputVectorShapes` also allows the vectorization of
-/// operations with dynamic shapes.
+/// `inputVectorSizes` are used to vectorize this operation.
+/// `inputVectorSizes` must match the rank of the iteration space of the
+/// operation and the input vector sizes must be greater than or equal to
+/// their counterpart iteration space sizes, if static. `inputVectorShapes`
+/// also allows the vectorization of operations with dynamic shapes.
 LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
                                       ArrayRef<int64_t> inputVectorSizes,
                                       ArrayRef<bool> inputScalableVecDims,
@@ -1867,8 +1995,9 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
   auto vectorizeResult =
       TypeSwitch<Operation *, LogicalResult>(op)
           .Case<linalg::LinalgOp>([&](auto linalgOp) {
-            // TODO: isaConvolutionOpInterface that can also infer from generic
-            // features. Will require stride/dilation attributes inference.
+            // TODO: isaConvolutionOpInterface that can also infer from
+            // generic features. Will require stride/dilation attributes
+            // inference.
             if (isa<ConvolutionOpInterface>(linalgOp.getOperation())) {
               FailureOr<Operation *> convOr = vectorizeConvolution(
                   rewriter, linalgOp, flatten1DDepthwiseConv);
@@ -1902,6 +2031,10 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
             return vectorizeAsTensorPackOp(rewriter, packOp, inputVectorSizes,
                                            results);
           })
+          .Case<tensor::UnPackOp>([&](auto unpackOp) {
+            return vectorizeAsTensorUnpackOp(rewriter, unpackOp,
+                                             inputVectorSizes, results);
+          })
           .Default([](auto) { return failure(); });
 
   if (failed(vectorizeResult)) {
@@ -1919,7 +2052,6 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
 
 LogicalResult mlir::linalg::vectorizeCopy(RewriterBase &rewriter,
                                           memref::CopyOp copyOp) {
-
   auto srcType = cast<MemRefType>(copyOp.getSource().getType());
   auto dstType = cast<MemRefType>(copyOp.getTarget().getType());
   if (!srcType.hasStaticShape() || !dstType.hasStaticShape())
@@ -2833,8 +2965,8 @@ struct Conv1DGenerator
     Value res = rewriter.create<vector::TransferReadOp>(loc, resType, resShaped,
                                                         resPadding);
 
-    // The base vectorization case for channeled convolution is input: {n,w,c},
-    // weight: {kw,c,f}, output: {n,w,f}. To reuse the base pattern
+    // The base vectorization case for channeled convolution is input:
+    // {n,w,c}, weight: {kw,c,f}, output: {n,w,f}. To reuse the base pattern
     // vectorization case, we do pre transpose on input, weight, and output.
     switch (conv1DOpOrder) {
     case Conv1DOpOrder::W:
@@ -2877,9 +3009,9 @@ struct Conv1DGenerator
       return kw * (wSize / wSizeStep) + w;
     };
 
-    // Compute contraction: O{n, w, f} += I{n, sw * w + dw * kw, c} * F{c, f} or
-    // perform outerproduct for non-channeled convolution or
-    // perform simple arith operation for pooling
+    // Compute contraction: O{n, w, f} += I{n, sw * w + dw * kw, c} * F{c, f}
+    // or perform outerproduct for non-channeled convolution or perform simple
+    // arith operation for pooling
     for (int64_t kw = 0; kw < kwSize; ++kw) {
       for (int64_t w = 0; w < wSize; w += wSizeStep) {
         switch (oper) {
@@ -2908,9 +3040,9 @@ struct Conv1DGenerator
     // End vector-only rewrite part
     //===------------------------------------------------------------------===//
 
-    // The base vectorization case for channeled convolution is output: {n,w,f}
-    // To reuse the result from base pattern vectorization case, we post
-    // transpose the base case result.
+    // The base vectorization case for channeled convolution is output:
+    // {n,w,f} To reuse the result from base pattern vectorization case, we
+    // post transpose the base case result.
     switch (conv1DOpOrder) {
     case Conv1DOpOrder::W:
     case Conv1DOpOrder::Nwc:
@@ -3348,9 +3480,9 @@ static FailureOr<Operation *>
 vectorizeConvolution(RewriterBase &rewriter, LinalgOp op,
                      bool flatten1DDepthwiseConv) {
   // The ConvolutionOpInterface gives us guarantees of existence for
-  // strides/dilations. However, we do not need to rely on those, we can simply
-  // use them if present, otherwise use the default and let the generic conv.
-  // matcher in the ConvGenerator succeed or fail.
+  // strides/dilations. However, we do not need to rely on those, we can
+  // simply use them if present, otherwise use the default and let the generic
+  // conv. matcher in the ConvGenerator succeed or fail.
   auto strides = op->getAttrOfType<DenseIntElementsAttr>("strides");
   auto dilations = op->getAttrOfType<DenseIntElementsAttr>("dilations");
   auto stride = strides ? *strides.getValues<uint64_t>().begin() : 1;
diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
index 838255cf5a5ba3..50163880e85f96 100644
--- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
+++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
@@ -24,7 +24,6 @@
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -34,7 +33,6 @@
 #include <iterator>
 #include <numeric>
 #include <optional>
-#include <string>
 #include <utility>
 
 #define DEBUG_TYPE "mesh-ops"
@@ -244,6 +242,11 @@ void MeshShapeOp::build(OpBuilder &odsBuilder, OperationState &odsState,
         MeshAxesAttr::get(odsBuilder.getContext(), axes));
 }
 
+void MeshShapeOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResults()[0], "mesh_shape");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.shard attr
 //===----------------------------------------------------------------------===//
@@ -307,6 +310,15 @@ bool MeshShardingAttr::operator==(MeshShardingAttr rhs) const {
                       std::mem_fn(&MeshAxesAttr::empty));
 }
 
+//===----------------------------------------------------------------------===//
+// mesh.shard op
+//===----------------------------------------------------------------------===//
+
+void ShardOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "sharding_annotated");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.process_multi_index op
 //===----------------------------------------------------------------------===//
@@ -345,6 +357,11 @@ void ProcessMultiIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState,
         MeshAxesAttr::get(odsBuilder.getContext(), axes));
 }
 
+void ProcessMultiIndexOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResults()[0], "proc_linear_idx");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.process_linear_index op
 //===----------------------------------------------------------------------===//
@@ -363,6 +380,11 @@ void ProcessLinearIndexOp::build(OpBuilder &odsBuilder,
   build(odsBuilder, odsState, mesh.getSymName());
 }
 
+void ProcessLinearIndexOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "proc_linear_idx");
+}
+
 //===----------------------------------------------------------------------===//
 // collective communication ops
 //===----------------------------------------------------------------------===//
@@ -606,6 +628,11 @@ void AllGatherOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<AllGatherOp>>(context);
 }
 
+void AllGatherOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "all_gather");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.all_reduce op
 //===----------------------------------------------------------------------===//
@@ -620,6 +647,11 @@ void AllReduceOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<AllReduceOp>>(context);
 }
 
+void AllReduceOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "all_reduce");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.all_slice op
 //===----------------------------------------------------------------------===//
@@ -654,6 +686,11 @@ void AllSliceOp::build(OpBuilder &odsBuilder, OperationState &odsState,
         APInt(sizeof(sliceAxis) * CHAR_BIT, sliceAxis));
 }
 
+void AllSliceOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "all_slice");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.all_to_all op
 //===----------------------------------------------------------------------===//
@@ -674,6 +711,11 @@ void AllToAllOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<AllToAllOp>>(context);
 }
 
+void AllToAllOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "all_to_all");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.broadcast op
 //===----------------------------------------------------------------------===//
@@ -698,6 +740,11 @@ void BroadcastOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<BroadcastOp>>(context);
 }
 
+void BroadcastOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "broadcast");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.gather op
 //===----------------------------------------------------------------------===//
@@ -724,6 +771,11 @@ void GatherOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<GatherOp>>(context);
 }
 
+void GatherOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "gather");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.recv op
 //===----------------------------------------------------------------------===//
@@ -747,6 +799,10 @@ void RecvOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<RecvOp>>(context);
 }
 
+void RecvOp::getAsmResultNames(function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "recv");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.reduce op
 //===----------------------------------------------------------------------===//
@@ -770,6 +826,11 @@ void ReduceOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<ReduceOp>>(context);
 }
 
+void ReduceOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "reduce");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.reduce_scatter op
 //===----------------------------------------------------------------------===//
@@ -791,6 +852,11 @@ void ReduceScatterOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<ReduceScatterOp>>(context);
 }
 
+void ReduceScatterOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "reduce_scatter");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.scatter op
 //===----------------------------------------------------------------------===//
@@ -817,6 +883,11 @@ void ScatterOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<ScatterOp>>(context);
 }
 
+void ScatterOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "scatter");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.send op
 //===----------------------------------------------------------------------===//
@@ -839,6 +910,10 @@ void SendOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<EmptyMeshAxesCanonicalizationPattern<SendOp>>(context);
 }
 
+void SendOp::getAsmResultNames(function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "send");
+}
+
 //===----------------------------------------------------------------------===//
 // mesh.shift op
 //===----------------------------------------------------------------------===//
@@ -865,6 +940,11 @@ void ShiftOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   // offset % shift_axis_mesh_dim_size == 0.
 }
 
+void ShiftOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult(), "shift");
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 53e78d2c28b1d7..af7b85d458774d 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -182,7 +182,7 @@ StorageLayout::getFieldIndexAndStride(SparseTensorFieldKind kind,
   unsigned stride = 1;
   if (kind == SparseTensorFieldKind::CrdMemRef) {
     assert(lvl.has_value());
-    const Level cooStart = SparseTensorType(enc).getCOOStart();
+    const Level cooStart = SparseTensorType(enc).getAoSCOOStart();
     const Level lvlRank = enc.getLvlRank();
     if (lvl.value() >= cooStart && lvl.value() < lvlRank) {
       lvl = cooStart;
@@ -811,10 +811,10 @@ bool mlir::sparse_tensor::SparseTensorType::isCOOType(Level startLvl,
   return !isUnique || isUniqueLvl(lvlRank - 1);
 }
 
-Level mlir::sparse_tensor::SparseTensorType::getCOOStart() const {
+Level mlir::sparse_tensor::SparseTensorType::getAoSCOOStart() const {
   SmallVector<COOSegment> coo = getCOOSegments();
-  if (!coo.empty()) {
-    assert(coo.size() == 1);
+  assert(coo.size() == 1 || coo.empty());
+  if (!coo.empty() && coo.front().isAoS()) {
     return coo.front().lvlRange.first;
   }
   return lvlRank;
@@ -1051,7 +1051,7 @@ static SparseTensorEncodingAttr
 getNormalizedEncodingForSpecifier(SparseTensorEncodingAttr enc) {
   SmallVector<LevelType> lts;
   for (auto lt : enc.getLvlTypes())
-    lts.push_back(lt.stripProperties());
+    lts.push_back(lt.stripStorageIrrelevantProperties());
 
   return SparseTensorEncodingAttr::get(
       enc.getContext(), lts,
@@ -1137,7 +1137,7 @@ static LogicalResult verifyPackUnPack(Operation *op, bool requiresStaticShape,
     return op->emitError("the sparse-tensor must have an encoding attribute");
 
   // Verifies the trailing COO.
-  Level cooStartLvl = stt.getCOOStart();
+  Level cooStartLvl = stt.getAoSCOOStart();
   if (cooStartLvl < stt.getLvlRank()) {
     // We only supports trailing COO for now, must be the last input.
     auto cooTp = llvm::cast<ShapedType>(lvlTps.back());
@@ -1452,7 +1452,7 @@ LogicalResult ToCoordinatesOp::verify() {
 
 LogicalResult ToCoordinatesBufferOp::verify() {
   auto stt = getSparseTensorType(getTensor());
-  if (stt.getCOOStart() >= stt.getLvlRank())
+  if (stt.getAoSCOOStart() >= stt.getLvlRank())
     return emitError("expected sparse tensor with a COO region");
   return success();
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp
index 9414d81e6bf5c6..cd6b9b49893731 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp
@@ -22,13 +22,9 @@ using namespace sparse_tensor;
 // Helper methods.
 //===----------------------------------------------------------------------===//
 
-// TODO: reuse StorageLayout::foreachField?
-
-// TODO: we need COO AoS and SoA
-
 // Convert type range to new types range, with sparse tensors externalized.
-void convTypes(TypeRange types, SmallVectorImpl<Type> &convTypes,
-               SmallVectorImpl<Type> *extraTypes = nullptr) {
+static void convTypes(TypeRange types, SmallVectorImpl<Type> &convTypes,
+                      SmallVectorImpl<Type> *extraTypes = nullptr) {
   for (auto type : types) {
     // All "dense" data passes through unmodified.
     if (!getSparseTensorEncoding(type)) {
@@ -42,29 +38,30 @@ void convTypes(TypeRange types, SmallVectorImpl<Type> &convTypes,
     convTypes.push_back(vtp);
     if (extraTypes)
       extraTypes->push_back(vtp);
-    // Convert the external representations of the pos/crd arrays.
-    for (Level lvl = 0, lvlRank = stt.getLvlRank(); lvl < lvlRank; lvl++) {
-      const auto lt = stt.getLvlType(lvl);
-      if (isCompressedLT(lt) || isLooseCompressedLT(lt)) {
-        auto ptp = RankedTensorType::get(shape, stt.getPosType());
-        auto ctp = RankedTensorType::get(shape, stt.getCrdType());
-        convTypes.push_back(ptp);
-        convTypes.push_back(ctp);
-        if (extraTypes) {
-          extraTypes->push_back(ptp);
-          extraTypes->push_back(ctp);
-        }
-      } else {
-        assert(isDenseLT(lt)); // TODO: handle other cases
+
+    // Convert the external representation of the position/coordinate array.
+    foreachFieldAndTypeInSparseTensor(stt, [&convTypes, extraTypes](
+                                               Type t, FieldIndex,
+                                               SparseTensorFieldKind kind,
+                                               Level, LevelType) {
+      if (kind == SparseTensorFieldKind::CrdMemRef ||
+          kind == SparseTensorFieldKind::PosMemRef) {
+        ShapedType st = t.cast<ShapedType>();
+        auto rtp = RankedTensorType::get(st.getShape(), st.getElementType());
+        convTypes.push_back(rtp);
+        if (extraTypes)
+          extraTypes->push_back(rtp);
       }
-    }
+      return true;
+    });
   }
 }
 
 // Convert input and output values to [dis]assemble ops for sparse tensors.
-void convVals(OpBuilder &builder, Location loc, TypeRange types,
-              ValueRange fromVals, ValueRange extraVals,
-              SmallVectorImpl<Value> &toVals, unsigned extra, bool isIn) {
+static void convVals(OpBuilder &builder, Location loc, TypeRange types,
+                     ValueRange fromVals, ValueRange extraVals,
+                     SmallVectorImpl<Value> &toVals, unsigned extra,
+                     bool isIn) {
   unsigned idx = 0;
   for (auto type : types) {
     // All "dense" data passes through unmodified.
@@ -85,29 +82,28 @@ void convVals(OpBuilder &builder, Location loc, TypeRange types,
     if (!isIn) {
       inputs.push_back(extraVals[extra++]);
       retTypes.push_back(RankedTensorType::get(shape, stt.getElementType()));
-      cntTypes.push_back(builder.getIndexType());
+      cntTypes.push_back(builder.getIndexType()); // nnz
     }
+
     // Collect the external representations of the pos/crd arrays.
-    for (Level lvl = 0, lvlRank = stt.getLvlRank(); lvl < lvlRank; lvl++) {
-      const auto lt = stt.getLvlType(lvl);
-      if (isCompressedLT(lt) || isLooseCompressedLT(lt)) {
+    foreachFieldAndTypeInSparseTensor(stt, [&, isIn](Type t, FieldIndex,
+                                                     SparseTensorFieldKind kind,
+                                                     Level, LevelType) {
+      if (kind == SparseTensorFieldKind::CrdMemRef ||
+          kind == SparseTensorFieldKind::PosMemRef) {
         if (isIn) {
           inputs.push_back(fromVals[idx++]);
-          inputs.push_back(fromVals[idx++]);
         } else {
-          Type pTp = stt.getPosType();
-          Type cTp = stt.getCrdType();
-          inputs.push_back(extraVals[extra++]);
+          ShapedType st = t.cast<ShapedType>();
+          auto rtp = RankedTensorType::get(st.getShape(), st.getElementType());
           inputs.push_back(extraVals[extra++]);
-          retTypes.push_back(RankedTensorType::get(shape, pTp));
-          retTypes.push_back(RankedTensorType::get(shape, cTp));
-          cntTypes.push_back(pTp);
-          cntTypes.push_back(cTp);
+          retTypes.push_back(rtp);
+          cntTypes.push_back(rtp.getElementType());
         }
-      } else {
-        assert(isDenseLT(lt)); // TODO: handle other cases
       }
-    }
+      return true;
+    });
+
     if (isIn) {
       // Assemble multiple inputs into a single sparse tensor.
       auto a = builder.create<sparse_tensor::AssembleOp>(loc, rtp, inputs);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index d4459c6ea1e521..0ccb11f3a6b858 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -194,7 +194,7 @@ static void createAllocFields(OpBuilder &builder, Location loc,
       valHeuristic =
           builder.create<arith::MulIOp>(loc, valHeuristic, lvlSizesValues[lvl]);
   } else if (sizeHint) {
-    if (stt.getCOOStart() == 0) {
+    if (stt.getAoSCOOStart() == 0) {
       posHeuristic = constantIndex(builder, loc, 2);
       crdHeuristic = builder.create<arith::MulIOp>(
           loc, constantIndex(builder, loc, lvlRank), sizeHint); // AOS
@@ -1316,7 +1316,7 @@ struct SparseAssembleOpConverter : public OpConversionPattern<AssembleOp> {
     Value posBack = c0; // index to the last value in the position array
     Value memSize = c1; // memory size for current array
 
-    Level trailCOOStart = stt.getCOOStart();
+    Level trailCOOStart = stt.getAoSCOOStart();
     Level trailCOORank = stt.getLvlRank() - trailCOOStart;
     // Sets up SparseTensorSpecifier.
     for (Level lvl = 0, lvlRank = stt.getLvlRank(); lvl < lvlRank; lvl++) {
@@ -1453,7 +1453,7 @@ struct SparseNewConverter : public OpConversionPattern<NewOp> {
     const auto dstTp = getSparseTensorType(op.getResult());
     // Creating COO with NewOp is handled by direct IR codegen. All other cases
     // are handled by rewriting.
-    if (!dstTp.hasEncoding() || dstTp.getCOOStart() != 0)
+    if (!dstTp.hasEncoding() || dstTp.getAoSCOOStart() != 0)
       return failure();
 
     // Implement as follows:
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
index 7326a6a3811284..2ccb2361b5efe1 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -1180,7 +1180,7 @@ struct NewRewriter : public OpRewritePattern<NewOp> {
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
     auto stt = getSparseTensorType(op.getResult());
-    if (!stt.hasEncoding() || stt.getCOOStart() == 0)
+    if (!stt.hasEncoding() || stt.getAoSCOOStart() == 0)
       return failure();
 
     // Implement the NewOp as follows:
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp
index 75a43891491879..b888dfadb9c714 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp
@@ -568,7 +568,7 @@ Value sparse_tensor::genToCoordinates(OpBuilder &builder, Location loc,
   const auto srcTp = getSparseTensorType(tensor);
   const Type crdTp = srcTp.getCrdType();
   const Type memTp =
-      get1DMemRefType(crdTp, /*withLayout=*/lvl >= srcTp.getCOOStart());
+      get1DMemRefType(crdTp, /*withLayout=*/lvl >= srcTp.getAoSCOOStart());
   return builder.create<ToCoordinatesOp>(loc, memTp, tensor,
                                          builder.getIndexAttr(lvl));
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp
index 3ab4157475cd4c..6ac26ad550f9f3 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp
@@ -103,7 +103,7 @@ void SparseTensorSpecifier::setSpecifierField(OpBuilder &builder, Location loc,
 
 Value sparse_tensor::SparseTensorDescriptor::getCrdMemRefOrView(
     OpBuilder &builder, Location loc, Level lvl) const {
-  const Level cooStart = rType.getCOOStart();
+  const Level cooStart = rType.getAoSCOOStart();
   if (lvl < cooStart)
     return getMemRefField(SparseTensorFieldKind::CrdMemRef, lvl);
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h
index 3a61ec7a2236f3..c2f631605bf4b2 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h
@@ -137,7 +137,7 @@ class SparseTensorDescriptorImpl {
   }
 
   Value getAOSMemRef() const {
-    const Level cooStart = rType.getCOOStart();
+    const Level cooStart = rType.getAoSCOOStart();
     assert(cooStart < rType.getLvlRank());
     return getMemRefField(SparseTensorFieldKind::CrdMemRef, cooStart);
   }
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
index f20008a1ed2b2f..186f85d2ce20a6 100644
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -72,36 +72,73 @@ mlir::tensor::computeTransposedType(RankedTensorType rankedTensorType,
       RTTBuilder(rankedTensorType).setShape(transposedShape);
   return transposedTensorType;
 }
-
-SmallVector<int64_t>
-mlir::tensor::getPackInverseDestPermutation(PackOp packOp) {
-  // The permutation can be obtained from two permutations:
-  //   a) Compute the permutation vector to move the last `numPackedDims` into
-  //      the `innerPosDims` of a shape of rank `packedRank`.
-  //   b) Compute the permutation vector to move outer dims if the pack op
-  //      has outer_dims_perm.
-  // Apply (b) permutation on (a) permutation to get the final permutation.
-  int64_t numPackedDims = packOp.getInnerDimsPos().size();
-  int64_t packedRank = packOp.getDestType().getRank();
-  auto lastDims = llvm::to_vector(
-      llvm::seq<int64_t>(packedRank - numPackedDims, packedRank));
-  PackingMetadata packingMetadata = computePackingMetadata(
-      packOp.getDestType().getRank(), packOp.getInnerDimsPos());
-  SmallVector<int64_t> innerPositionsPerm = computePermutationVector(
-      packedRank, lastDims, packingMetadata.insertPositions);
+/// The permutation can be obtained from two permutations:
+///   a) Compute the permutation vector to move the last `numPackedDims` into
+///      the `innerPosDims` of a shape of rank `rank`.
+///   b) Compute the permutation vector to move outer dims if the
+///      `outerPerm` parameter is not empty.
+/// Apply (b) permutation on (a) permutation to get the final permutation.
+static SmallVector<int64_t>
+computePackUnPackPerm(int64_t rank, ArrayRef<int64_t> &innerDimsPos,
+                      ArrayRef<int64_t> &outerPerm,
+                      PackingMetadata &packingMetadata) {
+  int64_t numPackedDims = innerDimsPos.size();
+  auto lastDims =
+      llvm::to_vector(llvm::seq<int64_t>(rank - numPackedDims, rank));
+  packingMetadata = computePackingMetadata(rank, innerDimsPos);
+  SmallVector<int64_t> innerPositionsPerm =
+      computePermutationVector(rank, lastDims, packingMetadata.insertPositions);
 
   SmallVector<int64_t> outerPos = packingMetadata.outerPositions;
-  ArrayRef<int64_t> outerPerm = packOp.getOuterDimsPerm();
   if (!outerPerm.empty())
     applyPermutationToVector(outerPos, outerPerm);
-  SmallVector<int64_t> outerPositionPerm = computePermutationVector(
-      packedRank, packingMetadata.outerPositions, outerPos);
+  SmallVector<int64_t> outerPositionPerm =
+      computePermutationVector(rank, packingMetadata.outerPositions, outerPos);
 
   SmallVector<int64_t> packInverseDestPermutation = innerPositionsPerm;
   applyPermutationToVector(packInverseDestPermutation, outerPositionPerm);
   return packInverseDestPermutation;
 }
 
+/// Shell function to compute the Destination Permutation of PackOp
+/// This function uses the helper function `computePackUnPackPerm` to get
+/// the permutation vector. Only major difference between UnPack and Pack is
+/// that packOp uses destination rank whereas unpack Uses source rank.
+SmallVector<int64_t> mlir::tensor::getPackInverseDestPerm(PackOp packOp) {
+
+  PackingMetadata pMetadata;
+  int64_t packedRank = packOp.getDestType().getRank();
+  ArrayRef<int64_t> innerDimPos = packOp.getInnerDimsPos();
+  ArrayRef<int64_t> outerPerm = packOp.getOuterDimsPerm();
+  SmallVector<int64_t> packInvDestPerm =
+      computePackUnPackPerm(packedRank, innerDimPos, outerPerm, pMetadata);
+  return packInvDestPerm;
+}
+
+/// Shell function to compute the Source Permutation of unPackOp.
+/// This function, like the getPackInverseDestPerm uses the helper function
+/// computePackUnPackPerm` to get the permutation vector.
+/// Only major difference between UnPack and Pack is that packOp uses
+/// destination rank whereas unpack Uses source rank.
+SmallVector<int64_t> mlir::tensor::getUnPackInverseSrcPerm(UnPackOp unpackOp) {
+  PackingMetadata metadata;
+  return mlir::tensor::getUnPackInverseSrcPerm(unpackOp, metadata);
+}
+
+/// Shell function to compute the Source rank permutation for unpackOp
+/// Unpack requires some packing metadata data information, so created
+/// another function where this value is passed by reference.
+SmallVector<int64_t>
+mlir::tensor::getUnPackInverseSrcPerm(UnPackOp unpackOp,
+                                      PackingMetadata &metadata) {
+  int64_t unpackRank = unpackOp.getSourceType().getRank();
+  ArrayRef<int64_t> innerDimPos = unpackOp.getInnerDimsPos();
+  ArrayRef<int64_t> outerPerm = unpackOp.getOuterDimsPerm();
+  SmallVector<int64_t> unpackInvSrcPerm =
+      computePackUnPackPerm(unpackRank, innerDimPos, outerPerm, metadata);
+  return unpackInvSrcPerm;
+}
+
 bool mlir::tensor::isCastLikeInsertSliceOp(InsertSliceOp op) {
   llvm::SmallBitVector droppedDims = op.getDroppedDims();
   int64_t srcDim = 0;
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index 721f35162ef867..7adde3117deeaa 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -1029,3 +1029,38 @@ func.func @broadcast_same_shape(%input: tensor<2x3xf32>, %init: tensor<2x3xf32>)
   %0 = linalg.broadcast ins(%input: tensor<2x3xf32>) outs(%init: tensor<2x3xf32>) dimensions = []
   return %0 : tensor<2x3xf32>
 }
+
+// ----
+
+func.func @transpose_1d(%input: tensor<16xf32>,
+                        %init: tensor<16xf32>) -> tensor<16xf32> {
+  %transpose = linalg.transpose
+      ins(%input:tensor<16xf32>)
+      outs(%init:tensor<16xf32>)
+      permutation = [0]
+  func.return %transpose : tensor<16xf32>
+}
+
+// CHECK-LABEL: func @transpose_1d(
+//  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<16xf32>,
+//  CHECK-SAME:     %[[INIT:[a-zA-Z0-9]+]]: tensor<16xf32>)
+//   CHECK-NOT:   linalg.transpose
+//       CHECK:   return %[[INPUT]] : tensor<16xf32>
+
+// -----
+
+func.func @transpose_identity_perm(%input: tensor<16x32x64xf32>,
+                                   %init: tensor<16x32x64xf32>) -> tensor<16x32x64xf32> {
+  %transpose = linalg.transpose
+      ins(%input:tensor<16x32x64xf32>)
+      outs(%init:tensor<16x32x64xf32>)
+      permutation = [0, 1, 2]
+  func.return %transpose : tensor<16x32x64xf32>
+}
+
+// CHECK-LABEL: func @transpose_identity_perm(
+//  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<16x32x64xf32>,
+//  CHECK-SAME:     %[[INIT:[a-zA-Z0-9]+]]: tensor<16x32x64xf32>)
+//   CHECK-NOT:   linalg.transpose
+//       CHECK:   return %[[INPUT]] : tensor<16x32x64xf32>
+
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir b/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir
index d63433248ab1e0..0a197a0ee9fa68 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir
@@ -48,12 +48,8 @@ func.func @pad_and_pack(%arg0: tensor<13x15xf32>, %arg1: tensor<2x8x8x2xf32>, %a
 // CHECK:             %[[PAD:.+]] = tensor.pad %[[SRC_SLICE]]
 // CHECK:               tensor.yield %[[PAD_VAL]]
 // CHECK:             } : tensor<?x?xf32> to tensor<8x2xf32>
-// CHECK:             %[[EMPTY:.+]] = tensor.empty() : tensor<8x2xf32>
-// CHECK:         %[[TRANSP:.+]] = linalg.transpose
-// CHECK-SAME:      ins(%[[PAD]] : tensor<8x2xf32>)
-// CHECK-SAME:      outs(%[[EMPTY]] : tensor<8x2xf32>)
-// CHECK-SAME:      permutation = [0, 1]
-// CHECK:         %{{.+}} = tensor.insert_slice %[[TRANSP]] into %{{.+}}
+// CHECK-NOT:         linalg.transpose
+// CHECK:             %{{.+}} = tensor.insert_slice %[[PAD]] into %{{.+}}
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -81,12 +77,8 @@ func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>)
 // CHECK-DAG:         %[[IN_C:.+]] = affine.apply #[[MAP2]](%[[C]])
 // CHECK:             %[[TILE:.+]] = tensor.extract_slice %[[SRC]]
 // CHECK-SAME:          [%[[IN_K]], %[[IN_C]]] [32, 8] [1, 1]
-// CHECK:             %[[EMPTY:.+]] = tensor.empty() : tensor<32x8xf32>
-// CHECK:             %[[TRANSP:.+]] =  linalg.transpose
-// CHECK-SAME:          ins(%[[TILE]]
-// CHECK-SAME:          outs(%[[EMPTY]]
-// CHECK-SAME:          permutation = [0, 1]
-// CHECK:             %[[SUB_ITER:.+]] = tensor.insert_slice %[[TRANSP]] into %{{[a-zA-Z0-9]+}}
+// CHECK-NOT:         linalg.transpose
+// CHECK:             %[[SUB_ITER:.+]] = tensor.insert_slice %[[TILE]] into %{{[a-zA-Z0-9]+}}
 // CHECK-SAME:          [0, 0, 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] : tensor<32x8xf32> into tensor<1x1x32x8xf32>
 // CHECK:             %{{.+}} = tensor.insert_slice %[[SUB_ITER]] into %{{[a-zA-Z0-9]+}}
 // CHECK-SAME:          [%[[C]], %[[K]], 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] : tensor<1x1x32x8xf32> into tensor<32x4x32x8xf32>
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir b/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir
index eaad6bd8270476..7d87a0994004fe 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir
@@ -29,12 +29,8 @@ func.func @simple_pad_and_pack(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2x
 // CHECK-SAME:    %[[PAD_VAL:[a-zA-Z0-9]+]]
 // CHECK:         %[[PAD:.+]] = tensor.pad %[[SRC]] low[0, 0] high[3, 1]
 // CHECK:           tensor.yield %[[PAD_VAL]]
-// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<8x2xf32>
-// CHECK:         %[[TRANSP:.+]] = linalg.transpose
-// CHECK-SAME:      ins(%[[PAD]] : tensor<8x2xf32>)
-// CHECK-SAME:      outs(%[[EMPTY]] : tensor<8x2xf32>)
-// CHECK-SAME:      permutation = [0, 1]
-// CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]]
+// CHECK-NOT:     linalg.transpose
+// CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[PAD]] into %[[DEST]]
 // CHECK-SAME:      [0, 0, 0, 0] [1, 1, 8, 2] [1, 1, 1, 1]
 // CHECK:         return %[[INSERT]]
 
@@ -47,12 +43,8 @@ func.func @simple_NC_to_CNnc(%arg0: tensor<32x8xf32>, %arg1: tensor<1x1x32x8xf32
 // CHECK-LABEL: func.func @simple_NC_to_CNnc
 // CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
 // CHECK-SAME:    %[[DEST:[a-zA-Z0-9]+]]
-// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<32x8xf32>
-// CHECK:         %[[TRANSP:.+]] =  linalg.transpose
-// CHECK-SAME:      ins(%[[SRC]] : tensor<32x8xf32>)
-// CHECK-SAME:      outs(%[[EMPTY]] : tensor<32x8xf32>)
-// CHECK-SAME:      permutation = [0, 1]
-// CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]]
+// CHECK-NOT:     linalg.transpose
+// CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[SRC]] into %[[DEST]]
 // CHECK-SAME:      [0, 0, 0, 0] [1, 1, 32, 8] [1, 1, 1, 1]
 // CHECK:         return %[[INSERT]]
 
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir b/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir
index f0d4b790520e03..7d64331c987841 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir
@@ -57,12 +57,8 @@ func.func @unpack_and_extract_slice(%arg0: tensor<2x8x8x2xf32>, %arg1: tensor<13
 // CHECK-SAME:          [%[[I]], %[[J]]] [%[[OUT_I_SZ]], %[[OUT_J_SZ]]]
 // CHECK:             %[[TILE:.+]] = tensor.extract_slice %[[SRC_SLICE]]
 // CHECK-SAME:          [0, 0, 0, 0] [1, 1, 8, 2] [1, 1, 1, 1] : tensor<1x1x8x2xf32> to tensor<8x2xf32>
-// CHECK:             %[[EMPTY:.+]] = tensor.empty() : tensor<8x2xf32>
-// CHECK:             %[[TRANSP:.+]] =  linalg.transpose
-// CHECK-SAME:          ins(%[[TILE]] : tensor<8x2xf32>)
-// CHECK-SAME:          outs(%[[EMPTY]] : tensor<8x2xf32>)
-// CHECK-SAME:          permutation = [0, 1]
-// CHECK:             %[[UNPACK_TILE:.+]] = tensor.extract_slice %[[TRANSP]]
+// CHECK-NOT:         linalg.transpose
+// CHECK:             %[[UNPACK_TILE:.+]] = tensor.extract_slice %[[TILE]]
 // CHECK-SAME:          [0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]]] [1, 1]
 // CHECK:             %[[INSERT1:.+]] = tensor.insert_slice %[[UNPACK_TILE]] into %[[ITER_SLICE]]
 // CHECK-SAME:          [0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]]] [1, 1]
@@ -96,12 +92,8 @@ func.func @CKkc_to_KC(%arg0: tensor<32x4x32x8xf32>, %arg1: tensor<128x256xf32>)
 // CHECK-SAME:          [%[[IN_C]], %[[IN_K]], 0, 0] [1, 1, 32, 8] [1, 1, 1, 1]
 // CHECK:             %[[TILE:.+]] = tensor.extract_slice %[[SRC_SLICE]]
 // CHECK-SAME:          [0, 0, 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] : tensor<1x1x32x8xf32> to tensor<32x8xf32>
-// CHECK:             %[[EMPTY:.+]] = tensor.empty() : tensor<32x8xf32>
-// CHECK:             %[[TRANSP:.+]] =  linalg.transpose
-// CHECK-SAME:          ins(%[[TILE]]
-// CHECK-SAME:          outs(%[[EMPTY]]
-// CHECK-SAME:          permutation = [0, 1]
-// CHECK:             %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %{{[a-zA-Z0-9]+}}
+// CHECK-NOT:         linalg.transpose
+// CHECK:             %[[INSERT:.+]] = tensor.insert_slice %[[TILE]] into %{{[a-zA-Z0-9]+}}
 // CHECK-SAME:          [%[[K]], %[[C]]] [32, 8] [1, 1]
 
 
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir b/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir
index 02376808865006..153ce68b8f086c 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir
@@ -27,14 +27,10 @@ func.func @simple_unpack_and_extract_slice(%input: tensor<1x1x8x2xf32>, %output:
 // CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
 // CHECK-SAME:    %[[DEST:[a-zA-Z0-9]+]]
 // CHECK:         %[[TILE:.+]] = tensor.extract_slice %[[SRC]][0, 0, 0, 0] [1, 1, 8, 2] [1, 1, 1, 1]
-// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<8x2xf32>
-// CHECK:         %[[TRANSP:.+]] =  linalg.transpose
-// CHECK-SAME:      ins(%[[TILE]] : tensor<8x2xf32>)
-// CHECK-SAME:      outs(%[[EMPTY]] : tensor<8x2xf32>)
-// CHECK-SAME:      permutation = [0, 1]
+// CHECK-NOT:     linalg.transpose
 //                They have the same type, so the insert_slice op is folded
 //                away.
-// CHECK:         %[[SLICE:.+]] = tensor.extract_slice %[[TRANSP]][0, 0] [5, 1] [1, 1]
+// CHECK:         %[[SLICE:.+]] = tensor.extract_slice %[[TILE]][0, 0] [5, 1] [1, 1]
 // CHECK:         return %[[SLICE]]
 
 // -----
@@ -47,14 +43,10 @@ func.func @simple_CNnc_to_NC(%arg0: tensor<1x1x32x8xf32>, %arg1: tensor<32x8xf32
 // CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
 // CHECK-SAME:    %[[DEST:[a-zA-Z0-9]+]]
 // CHECK:         %[[TILE:.+]] = tensor.extract_slice %[[SRC]][0, 0, 0, 0] [1, 1, 32, 8] [1, 1, 1, 1]
-// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<32x8xf32>
-// CHECK:         %[[TRANSP:.+]] =  linalg.transpose
-// CHECK-SAME:      ins(%[[TILE]] : tensor<32x8xf32>)
-// CHECK-SAME:      outs(%[[EMPTY]] : tensor<32x8xf32>)
-// CHECK-SAME:      permutation = [0, 1]
+// CHECK-NOT:     linalg.transpose
 //                They have the same type, so the insert_slice op is folded
 //                away.
-// CHECK:         return %[[TRANSP]]
+// CHECK:         return %[[TILE]]
 
 // -----
 
@@ -75,7 +67,6 @@ func.func @simple_NCHWc_to_NCHW(%arg0: tensor<2x1x16x8x32xf32>, %arg1: tensor<2x
 //                away.
 // CHECK:         return %[[TRANSP]]
 
-
 // -----
 
 func.func @simple_NHWC_to_NCHW(%arg0: tensor<1x16x8x32xf32>, %arg1: tensor<1x32x16x8xf32>) -> tensor<1x32x16x8xf32> {
diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
index 0272ac599aa3db..2d01d57304013c 100644
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -697,3 +697,118 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack
+func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
+// CHECK: %[[C0:.*]] = arith.constant 0
+// CHECK: %[[DIM:.*]] = tensor.dim %arg0, %[[C0]] : tensor<?x?xf32>
+// CHECK: %[[C1:.*]] = arith.constant 1 : index
+// CHECK: %[[DIM0:.*]] = tensor.dim %arg0, %[[C1]] : tensor<?x?xf32>
+// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
+// CHECK: %[[C01:.*]] = arith.constant 0
+// CHECK: %[[C02:.*]] = arith.constant 0
+// CHECK: %[[DIM4:.*]] = tensor.dim %arg1, %[[C02]] : tensor<?x?x16x2xf32>
+// CHECK: %[[CNST14:.*]] = arith.constant 1
+// CHECK: %[[DIM6:.*]] = tensor.dim %arg1, %[[CNST14]] : tensor<?x?x16x2xf32>
+// CHECK: %[[CNST16:.*]] = arith.constant 16 : index
+// CHECK: %[[CNST2:.*]] = arith.constant 2 : index
+// CHECK: %[[readMsk0:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
+// CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
+// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
+// CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32>
+// CHECK: %[[empt0:.*]] = tensor.empty
+// CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
+// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[empt0]]
+// CHECK: return %[[write0]]
+ %ret = tensor.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
+ return %ret : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+   %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op
+   transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_unpack
+func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
+    // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+    // CHECK: %[[C0:.*]]= arith.constant 0 : index
+    // CHECK: %[[C8:.*]] = arith.constant 8 : index
+    // CHECK: %[[C80:.*]] = arith.constant 8 : index
+    // CHECK: %[[C32:.*]] = arith.constant 32 : index
+    // CHECK: %[[C16:.*]] = arith.constant 16 : index
+    // CHECK: %[[MSK0:.*]] = vector.create_mask %[[C8]], %[[C80]], %[[C32]], %[[C16]] : vector<16x8x32x16xi1>
+    // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] {{.*}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32>
+    // CHECK: %[[TRANSP0:.*]] = vector.transpose %[[READ0]], [0, 2, 1, 3] : vector<16x8x32x16xf32> to vector<16x32x8x16xf32>
+    // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP0]] : vector<16x32x8x16xf32> to vector<512x128xf32>
+    // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
+    // CHECK: %[[C01:.*]] = arith.constant 0 : index
+    // CHECK: %[[C256:.*]] = arith.constant 256 : index
+    // CHECK: %[[C128:.*]] = arith.constant 128 : index
+    // CHECK: %[[WRITEMSK:.*]] = vector.create_mask %[[C256]], %[[C128]] : vector<512x128xi1>
+    // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] {{.*}} : vector<512x128xi1> -> tensor<256x128xf32>
+    // CHECK: return %[[WRIT]] : tensor<256x128xf32>
+   %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
+   return %0 : tensor<256x128xf32>
+ }
+ module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op
+    transform.yield
+  } 
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_unpack_no_masks
+func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32>
+  // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32>
+  // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32>
+  // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
+  // CHECK: %[[C00:.*]] = arith.constant 0 : index
+  // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32>
+  // CHECK: return %[[WRIT]] : tensor<256x128xf32>
+   %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
+   return %0 : tensor<256x128xf32>
+ }
+ module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
+    transform.yield
+  } 
+ }
+
+  // -----
+
+  // CHECK-LABEL: test_vectorize_unpack_with_outer_perm
+  func.func @test_vectorize_unpack_with_outer_perm(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32>
+  // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32>
+  // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32>
+  // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
+  // CHECK: %[[C00:.*]] = arith.constant 0 : index
+  // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32>
+  // CHECK: return %[[WRIT]] : tensor<256x128xf32>
+   %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
+   return %0 : tensor<256x128xf32>
+ }
+ module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
+    transform.yield
+  } 
+}
diff --git a/mlir/test/Dialect/Mesh/process-multi-index-op-lowering.mlir b/mlir/test/Dialect/Mesh/process-multi-index-op-lowering.mlir
index 677a5982ea2540..e23cfd79a42745 100644
--- a/mlir/test/Dialect/Mesh/process-multi-index-op-lowering.mlir
+++ b/mlir/test/Dialect/Mesh/process-multi-index-op-lowering.mlir
@@ -6,7 +6,7 @@ mesh.mesh @mesh2d(shape = ?x?)
 func.func @multi_index_2d_mesh() -> (index, index) {
   // CHECK: %[[LINEAR_IDX:.*]] = mesh.process_linear_index on @mesh2d : index
   // CHECK: %[[MESH_SHAPE:.*]]:2 = mesh.mesh_shape @mesh2d : index, index
-  // CHECK: %[[MULTI_IDX:.*]]:2 = affine.delinearize_index %0 into (%[[MESH_SHAPE]]#0, %[[MESH_SHAPE]]#1) : index, index
+  // CHECK: %[[MULTI_IDX:.*]]:2 = affine.delinearize_index %[[LINEAR_IDX]] into (%[[MESH_SHAPE]]#0, %[[MESH_SHAPE]]#1) : index, index
   %0:2 = mesh.process_multi_index on @mesh2d : index, index
   // CHECK: return %[[MULTI_IDX]]#0, %[[MULTI_IDX]]#1 : index, index
   return %0#0, %0#1 : index, index
@@ -16,7 +16,7 @@ func.func @multi_index_2d_mesh() -> (index, index) {
 func.func @multi_index_2d_mesh_single_inner_axis() -> index {
   // CHECK: %[[LINEAR_IDX:.*]] = mesh.process_linear_index on @mesh2d : index
   // CHECK: %[[MESH_SHAPE:.*]]:2 = mesh.mesh_shape @mesh2d : index, index
-  // CHECK: %[[MULTI_IDX:.*]]:2 = affine.delinearize_index %0 into (%[[MESH_SHAPE]]#0, %[[MESH_SHAPE]]#1) : index, index
+  // CHECK: %[[MULTI_IDX:.*]]:2 = affine.delinearize_index %[[LINEAR_IDX]] into (%[[MESH_SHAPE]]#0, %[[MESH_SHAPE]]#1) : index, index
   %0 = mesh.process_multi_index on @mesh2d axes = [0] : index
   // CHECK: return %[[MULTI_IDX]]#0 : index
   return %0 : index
diff --git a/mlir/test/Dialect/SparseTensor/external.mlir b/mlir/test/Dialect/SparseTensor/external.mlir
index c17ba13e86c926..b5701ad2024264 100644
--- a/mlir/test/Dialect/SparseTensor/external.mlir
+++ b/mlir/test/Dialect/SparseTensor/external.mlir
@@ -100,3 +100,27 @@ func.func @sparse_out2(%arg0: tensor<64x64xf32>) -> (tensor<64x64xf32>, tensor<6
 func.func @sparse_inout(%arg0: tensor<64x64xf32, #sparse>) -> tensor<64x64xf32, #sparse> {
   return %arg0 : tensor<64x64xf32, #sparse>
 }
+
+// -----
+
+// CHECK-LABEL: func.func @sparse_inout_coo_soa(
+// CHECK-SAME:    %[[A:.*0]]: tensor<?xf32>,
+// CHECK-SAME:    %[[B:.*1]]: tensor<?xindex>,
+// CHECK-SAME:    %[[C:.*2]]: tensor<?xindex>,
+// CHECK-SAME:    %[[D:.*3]]: tensor<?xindex>,
+// CHECK-SAME:    %[[E:.*4]]: tensor<?xf32>,
+// CHECK-SAME:    %[[F:.*5]]: tensor<?xindex>,
+// CHECK-SAME:    %[[G:.*6]]: tensor<?xindex>,
+// CHECK-SAME:    %[[H:.*7]]: tensor<?xindex>) -> (tensor<?xf32>, tensor<?xindex>, tensor<?xindex>, tensor<?xindex>) {
+// CHECK:         %[[I:.*]] = sparse_tensor.assemble %[[A]], %[[B]], %[[C]], %[[D]]
+// CHECK:         %[[F:.*]] = call @_internal_sparse_inout_coo_soa(%[[I]])
+// CHECK:         sparse_tensor.disassemble %[[F]]
+// CHECK:         return
+// CHECK:       }
+// CHECK:       func.func private @_internal_sparse_inout
+#sparse = #sparse_tensor.encoding<{
+   map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton(soa))
+}>
+func.func @sparse_inout_coo_soa(%arg0: tensor<64x64xf32, #sparse>) -> tensor<64x64xf32, #sparse> {
+  return %arg0 : tensor<64x64xf32, #sparse>
+}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
new file mode 100644
index 00000000000000..8ee4e1fb48fef1
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
@@ -0,0 +1,121 @@
+// DEFINE: %{compile} =  mlir-opt %s \
+// DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
+// DEFINE:    -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm -o %t
+// DEFINE: %{entry_point} = mmt4d
+// DEFINE: %{run} = mlir-cpu-runner %t -e %{entry_point} -entry-point-result=void \
+// DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
+
+// RUN: %{compile}
+
+// RUN: %{run} | FileCheck %s
+
+func.func @mmt4d() {
+  // Allocate the matrices
+  %A_alloc = tensor.empty() : tensor<2x2x3x1xi32>
+  %B_alloc = tensor.empty() : tensor<2x2x3x1xi32>
+  %C_alloc = tensor.empty() : tensor<2x2x3x3xi32>
+  %C_in = arith.constant dense<[
+    [[[ 1, 2, 3],
+     [ 4, 5, 6],
+     [ 7, 8, 9]],
+    [[ 11, 12, 13],
+     [ 14, 15, 16],
+     [ 17, 18, 19]]],
+    [[[ 21, 22, 23],
+     [ 24, 25, 26],
+     [ 27, 28, 29]],
+    [[ 31, 32, 33],
+     [ 34, 35, 36],
+     [ 37, 38, 39]]]
+  ]> : tensor<2x2x3x3xi32>
+
+  // Initialise the matrices
+  %three = arith.constant 3 : i32
+  %four = arith.constant 4 : i32
+  %A = linalg.fill ins(%three : i32) outs(%A_alloc : tensor<2x2x3x1xi32>) -> tensor<2x2x3x1xi32>
+  %B = linalg.fill ins(%four : i32) outs(%B_alloc : tensor<2x2x3x1xi32>) -> tensor<2x2x3x1xi32>
+
+  // Matmul
+  %C_out = linalg.mmt4d ins(%A, %B: tensor<2x2x3x1xi32>, tensor<2x2x3x1xi32>) outs(%C_in: tensor<2x2x3x3xi32>) -> tensor<2x2x3x3xi32>
+
+  // Print and verify the output
+  // CHECK:  Unranked Memref {{.*}} rank = 4 offset = 0 sizes = [2, 2, 3, 3] strides = [18, 9, 3, 1] data =
+  // C[0, 0]
+  // CHECK-NEXT: [25,  26, 27]
+  // CHECK-NEXT: [28,  29, 30]
+  // CHECK-NEXT: [31,  32, 33]
+  // C[0, 1]
+  // CHECK-NEXT: [35,  36, 37]
+  // CHECK-NEXT: [38,  39, 40]
+  // CHECK-NEXT: [41,  42, 43]
+  // C[1, 0]
+  // CHECK-NEXT: [45,  46, 47]
+  // CHECK-NEXT: [48,  49, 50]
+  // CHECK-NEXT: [51,  52, 53]
+  // C[1, 1]
+  // CHECK-NEXT: [55,  56, 57]
+  // CHECK-NEXT: [58,  59, 60]
+  // CHECK-NEXT: [61,  62, 63]
+
+  %xf = tensor.cast %C_out : tensor<2x2x3x3xi32> to tensor<*xi32>
+  call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()
+
+  return
+}
+
+module @transforms attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+   %mmt4d = transform.collect_matching @match_mmt4d in %module : (!transform.any_op) -> (!transform.any_op)
+   %func = transform.get_parent_op %mmt4d {isolated_from_above} : (!transform.any_op) -> !transform.op<"func.func">
+
+   // Step 1: Tile
+   // Tile parallel dims
+   %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d[1, 1, 0, 3, 3, 0]
+     : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+   // Tile reduction dims
+   %tiled_linalg_op_r, %loops2:2 = transform.structured.tile_using_for %tiled_linalg_op_p[0, 0, 1, 0, 0, 1]
+     : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+   // Step 2: Vectorize
+   transform.structured.vectorize %tiled_linalg_op_r : !transform.any_op
+
+   // Step 3: Simplify
+   // vector.multi_reduction --> vector.contract
+   // Generates a 6-dim vector.contract with the dim matching the original MMT4D Op
+   // and with the following split into parallel and reduction dims:
+   //    * parallel, parallel, reduction, parallel, parallel, reduction
+   transform.apply_patterns to %func {
+     transform.apply_patterns.vector.reduction_to_contract
+     // Reduce the rank of xfer ops. This transforms vector.contract to be
+     // more matmul-like and to enable the lowering to outer product Ops.
+     transform.apply_patterns.vector.transfer_permutation_patterns
+   } : !transform.op<"func.func">
+
+   // Hoisting and LICM - not strictly required
+   %func_h = transform.structured.hoist_redundant_vector_transfers %func
+     : (!transform.op<"func.func">) -> !transform.op<"func.func">
+   %all_loops = transform.structured.match interface{LoopLikeInterface} in %func_h
+     : (!transform.op<"func.func">) -> !transform.any_op
+   transform.apply_licm to %all_loops : !transform.any_op
+   transform.loop.hoist_loop_invariant_subsets %all_loops : !transform.any_op
+
+   // Simplify the 6-dim vector.contract into a 3-dim matmul-like
+   // vector.contract with the following split into parallel and reduction
+   // dims:
+   //    * parallel, parallel, reduction
+   transform.apply_patterns to %func_h {
+     transform.apply_patterns.vector.reduction_to_contract
+     transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+     transform.apply_patterns.canonicalization
+   } : !transform.op<"func.func">
+    transform.yield
+  }
+
+  transform.named_sequence @match_mmt4d(
+      %entry: !transform.any_op {transform.readonly}) -> !transform.any_op {
+    transform.match.operation_name %entry ["linalg.mmt4d"] : !transform.any_op
+    transform.yield %entry : !transform.any_op
+  }
+}
+
+func.func private @printMemrefI32(%ptr : tensor<*xi32>)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
index aaf15ecc681fc2..16252c1005ebbb 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
@@ -34,6 +34,10 @@
   map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
 }>
 
+#SortedCOOSoA = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton(soa))
+}>
+
 #CSR = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
@@ -50,7 +54,7 @@
 
 module {
   func.func @add_coo_csr(%arga: tensor<8x8xf32, #CSR>,
-                         %argb: tensor<8x8xf32, #SortedCOO>)
+                         %argb: tensor<8x8xf32, #SortedCOOSoA>)
 		         -> tensor<8x8xf32> {
     %empty = tensor.empty() : tensor<8x8xf32>
     %zero = arith.constant 0.000000e+00 : f32
@@ -59,7 +63,7 @@ module {
         outs(%empty : tensor<8x8xf32>) -> tensor<8x8xf32>
     %0 = linalg.generic #trait
       ins(%arga, %argb: tensor<8x8xf32, #CSR>,
-                        tensor<8x8xf32, #SortedCOO>)
+                        tensor<8x8xf32, #SortedCOOSoA>)
       outs(%init: tensor<8x8xf32>) {
         ^bb(%a: f32, %b: f32, %x: f32):
           %0 = arith.addf %a, %b : f32
@@ -69,7 +73,7 @@ module {
   }
 
   func.func @add_coo_coo(%arga: tensor<8x8xf32, #SortedCOO>,
-                         %argb: tensor<8x8xf32, #SortedCOO>)
+                         %argb: tensor<8x8xf32, #SortedCOOSoA>)
 		         -> tensor<8x8xf32> {
     %empty = tensor.empty() : tensor<8x8xf32>
     %zero = arith.constant 0.000000e+00 : f32
@@ -78,7 +82,7 @@ module {
         outs(%empty : tensor<8x8xf32>) -> tensor<8x8xf32>
     %0 = linalg.generic #trait
       ins(%arga, %argb: tensor<8x8xf32, #SortedCOO>,
-                        tensor<8x8xf32, #SortedCOO>)
+                        tensor<8x8xf32, #SortedCOOSoA>)
       outs(%init: tensor<8x8xf32>) {
         ^bb(%a: f32, %b: f32, %x: f32):
           %0 = arith.addf %a, %b : f32
@@ -88,12 +92,12 @@ module {
   }
 
   func.func @add_coo_coo_out_coo(%arga: tensor<8x8xf32, #SortedCOO>,
-                                 %argb: tensor<8x8xf32, #SortedCOO>)
+                                 %argb: tensor<8x8xf32, #SortedCOOSoA>)
 		                 -> tensor<8x8xf32, #SortedCOO> {
     %init = tensor.empty() : tensor<8x8xf32, #SortedCOO>
     %0 = linalg.generic #trait
       ins(%arga, %argb: tensor<8x8xf32, #SortedCOO>,
-                        tensor<8x8xf32, #SortedCOO>)
+                        tensor<8x8xf32, #SortedCOOSoA>)
       outs(%init: tensor<8x8xf32, #SortedCOO>) {
         ^bb(%a: f32, %b: f32, %x: f32):
           %0 = arith.addf %a, %b : f32
@@ -104,7 +108,7 @@ module {
 
 
   func.func @add_coo_dense(%arga: tensor<8x8xf32>,
-                           %argb: tensor<8x8xf32, #SortedCOO>)
+                           %argb: tensor<8x8xf32, #SortedCOOSoA>)
   	    	         -> tensor<8x8xf32> {
     %empty = tensor.empty() : tensor<8x8xf32>
     %zero = arith.constant 0.000000e+00 : f32
@@ -113,7 +117,7 @@ module {
         outs(%empty : tensor<8x8xf32>) -> tensor<8x8xf32>
     %0 = linalg.generic #trait
       ins(%arga, %argb: tensor<8x8xf32>,
-                        tensor<8x8xf32, #SortedCOO>)
+                        tensor<8x8xf32, #SortedCOOSoA>)
       outs(%init: tensor<8x8xf32>) {
         ^bb(%a: f32, %b: f32, %x: f32):
           %0 = arith.addf %a, %b : f32
@@ -154,19 +158,19 @@ module {
     %COO_A = sparse_tensor.convert %A
       : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOO>
     %COO_B = sparse_tensor.convert %B
-      : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOO>
+      : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOOSoA>
 
     %C1 = call @add_coo_dense(%A, %COO_B) : (tensor<8x8xf32>,
-                                             tensor<8x8xf32, #SortedCOO>)
+                                             tensor<8x8xf32, #SortedCOOSoA>)
                                           -> tensor<8x8xf32>
     %C2 = call @add_coo_csr(%CSR_A, %COO_B) : (tensor<8x8xf32, #CSR>,
-                                               tensor<8x8xf32, #SortedCOO>)
+                                               tensor<8x8xf32, #SortedCOOSoA>)
                                             -> tensor<8x8xf32>
     %C3 = call @add_coo_coo(%COO_A, %COO_B) : (tensor<8x8xf32, #SortedCOO>,
-                                               tensor<8x8xf32, #SortedCOO>)
+                                               tensor<8x8xf32, #SortedCOOSoA>)
                                             -> tensor<8x8xf32>
     %COO_RET = call @add_coo_coo_out_coo(%COO_A, %COO_B) : (tensor<8x8xf32, #SortedCOO>,
-                                                            tensor<8x8xf32, #SortedCOO>)
+                                                            tensor<8x8xf32, #SortedCOOSoA>)
                                                          -> tensor<8x8xf32, #SortedCOO>
     %C4 = sparse_tensor.convert %COO_RET : tensor<8x8xf32, #SortedCOO> to tensor<8x8xf32>
     //
@@ -204,7 +208,7 @@ module {
     bufferization.dealloc_tensor %C4 : tensor<8x8xf32>
     bufferization.dealloc_tensor %CSR_A : tensor<8x8xf32, #CSR>
     bufferization.dealloc_tensor %COO_A : tensor<8x8xf32, #SortedCOO>
-    bufferization.dealloc_tensor %COO_B : tensor<8x8xf32, #SortedCOO>
+    bufferization.dealloc_tensor %COO_B : tensor<8x8xf32, #SortedCOOSoA>
     bufferization.dealloc_tensor %COO_RET : tensor<8x8xf32, #SortedCOO>
 
 
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index 31ceb05ad1dbfd..eb8c0aba1d33b6 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -1414,7 +1414,7 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
     }
     body.unindent() << "}\n";
     body.unindent();
-  } else if (dyn_cast<PropDictDirective>(element)) {
+  } else if (isa<PropDictDirective>(element)) {
     body << "  if (parseProperties(parser, result))\n"
          << "    return ::mlir::failure();\n";
   } else if (auto *customDir = dyn_cast<CustomDirective>(element)) {
@@ -2239,7 +2239,7 @@ void OperationFormat::genElementPrinter(FormatElement *element,
   }
 
   // Emit the attribute dictionary.
-  if (dyn_cast<PropDictDirective>(element)) {
+  if (isa<PropDictDirective>(element)) {
     genPropDictPrinter(*this, op, body);
     lastWasPunctuation = false;
     return;
diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp
index 77c34cb03e987e..426a3482960be4 100644
--- a/mlir/tools/mlir-tblgen/RewriterGen.cpp
+++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp
@@ -1785,7 +1785,7 @@ void PatternEmitter::createAggregateLocalVarsForOpArgs(
                     range);
       sizes.push_back(formatv("static_cast<int32_t>({0}.size())", range));
     } else {
-      sizes.push_back("1");
+      sizes.emplace_back("1");
       os << formatv("tblgen_values.push_back(");
       if (node.isNestedDagArg(argIndex)) {
         os << symbolInfoMap.getValueAndRangeUse(
diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp
index bb7241c2d51969..a37a2afc226453 100644
--- a/mlir/unittests/Bytecode/BytecodeTest.cpp
+++ b/mlir/unittests/Bytecode/BytecodeTest.cpp
@@ -23,7 +23,7 @@
 using namespace llvm;
 using namespace mlir;
 
-StringLiteral IRWithResources = R"(
+StringLiteral irWithResources = R"(
 module @TestDialectResources attributes {
   bytecode.test = dense_resource<resource> : tensor<4xi32>
 } {}
@@ -42,7 +42,7 @@ TEST(Bytecode, MultiModuleWithResource) {
   Builder builder(&context);
   ParserConfig parseConfig(&context);
   OwningOpRef<Operation *> module =
-      parseSourceString<Operation *>(IRWithResources, parseConfig);
+      parseSourceString<Operation *>(irWithResources, parseConfig);
   ASSERT_TRUE(module);
 
   // Write the module to bytecode
@@ -53,15 +53,15 @@ TEST(Bytecode, MultiModuleWithResource) {
 
   // Create copy of buffer which is aligned to requested resource alignment.
   constexpr size_t kAlignment = 0x20;
-  size_t buffer_size = buffer.size();
-  buffer.reserve(buffer_size + kAlignment - 1);
+  size_t bufferSize = buffer.size();
+  buffer.reserve(bufferSize + kAlignment - 1);
   size_t pad = ~(uintptr_t)buffer.data() + 1 & kAlignment - 1;
   buffer.insert(0, pad, ' ');
-  StringRef aligned_buffer(buffer.data() + pad, buffer_size);
+  StringRef alignedBuffer(buffer.data() + pad, bufferSize);
 
   // Parse it back
   OwningOpRef<Operation *> roundTripModule =
-      parseSourceString<Operation *>(aligned_buffer, parseConfig);
+      parseSourceString<Operation *>(alignedBuffer, parseConfig);
   ASSERT_TRUE(roundTripModule);
 
   // FIXME: Parsing external resources does not work on big-endian
diff --git a/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp b/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp
index 48c62ad20a04a6..5b48e80749c8b8 100644
--- a/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp
+++ b/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp
@@ -98,7 +98,7 @@ TEST(FileLineColLocBreakpointManager, OperationMatch) {
   // Set a breakpoint matching only the second operation in the list.
   auto *breakpoint = breakpointManager.addBreakpoint(
       fileNames[0], lineColLoc[0].first, lineColLoc[0].second);
-  auto checkMatchIdxs = [&](DenseSet<int> idxs) {
+  auto checkMatchIdxs = [&](const DenseSet<int> &idxs) {
     counter = 0;
     int reference = 0;
     for (int i = 0; i < (int)operations.size(); ++i) {
diff --git a/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp b/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp
index 56a98cc205ab43..3a6bcbd999a57a 100644
--- a/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp
+++ b/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp
@@ -77,7 +77,7 @@ class SerializationTest : public ::testing::Test {
   }
 
   // Inserts an Integer or a Vector of Integers constant of value 'val'.
-  spirv::ConstantOp AddConstInt(Type type, APInt val) {
+  spirv::ConstantOp AddConstInt(Type type, const APInt &val) {
     OpBuilder builder(module->getRegion());
     auto loc = UnknownLoc::get(&context);