From 159f6125b6ef506acdda24cebfafd566bed83df2 Mon Sep 17 00:00:00 2001
From: Keenan Gugeler <kuzu@kgugeler.ca>
Date: Thu, 5 Oct 2023 16:03:55 -0400
Subject: [PATCH] tidy: add clang-tidy

clang-tidy is a static analyzer, and can find many different issues.
So far I have integrated a few simple checks, and disabled the remaining
ones. They will be enabled in future PRs.

Closes #2152. Ref #2148.
---
 .clang-tidy                                   | 16 +++++++++++
 .github/workflows/ci-workflow.yml             | 27 ++++++++++++++-----
 Makefile                                      | 10 +++++++
 dataset/invalid-utf8/invalid-utf8.csv         |  1 +
 src/common/arrow/arrow_converter.cpp          |  2 +-
 src/common/types/date_t.cpp                   |  2 +-
 src/common/types/dtime_t.cpp                  |  2 +-
 src/common/types/types.cpp                    |  4 +--
 src/function/base_lower_upper_operation.cpp   | 14 ++++++++--
 src/include/common/system_message.h           | 22 +++++++++++++++
 src/include/common/types/cast_helpers.h       |  4 +--
 .../arithmetic/arithmetic_functions.h         |  3 ++-
 .../function/boolean/boolean_functions.h      |  8 ++----
 src/include/processor/operator/scan_node_id.h |  8 +-----
 src/main/plan_printer.cpp                     | 11 +++++---
 src/main/query_result.cpp                     |  8 +++---
 .../agg_key_dependency_optimizer.cpp          |  9 ++++---
 .../operator/persistent/copy_rel_columns.cpp  |  1 +
 .../persistent/reader/csv/base_csv_reader.cpp | 20 +++++++-------
 .../reader/csv/parallel_csv_reader.cpp        |  3 ++-
 .../reader/parquet/parquet_reader.cpp         | 10 -------
 .../operator/persistent/reader_functions.cpp  |  9 +++++--
 src/processor/operator/scan_node_id.cpp       |  9 +++++++
 src/processor/result/factorized_table.cpp     |  3 +--
 src/storage/buffer_manager/vm_region.cpp      | 12 ++++++---
 src/storage/stats/node_table_statistics.cpp   |  2 +-
 .../lists/lists_update_store.cpp              |  4 +--
 src/storage/store/column_chunk.cpp            |  4 +--
 test/c_api/query_result_test.cpp              | 16 +++++++++++
 .../exceptions/runtime/invalid_utf8.test      |  9 +++++++
 tools/benchmark/main.cpp                      |  6 ++---
 tools/java_api/src/jni/kuzu_java.cpp          |  7 +++--
 tools/nodejs_api/CMakeLists.txt               | 14 +++++-----
 tools/shell/embedded_shell.cpp                |  2 --
 34 files changed, 189 insertions(+), 93 deletions(-)
 create mode 100644 .clang-tidy
 create mode 100644 dataset/invalid-utf8/invalid-utf8.csv
 create mode 100644 src/include/common/system_message.h
 create mode 100644 test/test_files/exceptions/runtime/invalid_utf8.test

diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000000..f2a3fc43bd
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,16 @@
+# Bugprone exception escape is super unhelpful.
+Checks:
+  -*,
+
+  bugprone-*,
+  -bugprone-easily-swappable-parameters,
+  -bugprone-exception-escape,
+  -bugprone-forward-declaration-namespace,
+  -bugprone-implicit-widening-of-multiplication-result,
+  -bugprone-narrowing-conversions,
+  -bugprone-signed-char-misuse,
+
+  clang-analyzer-deadcode.DeadStores,
+  concurrency-*,
+HeaderFilterRegex: src/include|tools/benchmark/include|tools/nodejs_api/src_cpp/include|tools/python_api/src_cpp/include|tools/rust_api/include|tools/shell/include
+WarningsAsErrors: "*"
diff --git a/.github/workflows/ci-workflow.yml b/.github/workflows/ci-workflow.yml
index 310dc73437..79e62f2479 100644
--- a/.github/workflows/ci-workflow.yml
+++ b/.github/workflows/ci-workflow.yml
@@ -20,7 +20,7 @@ jobs:
         run: |
           pip install torch~=2.0.0 --extra-index-url https://download.pytorch.org/whl/cpu &&\
           pip install --user -r tools/python_api/requirements_dev.txt -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
-      
+
       - name: Ensure Node.js dependencies
         run: npm install --include=dev
         working-directory: tools/nodejs_api
@@ -30,7 +30,7 @@ jobs:
 
       - name: Python test
         run: CC=gcc CXX=g++ make pytest NUM_THREADS=32
-      
+
       - name: Node.js test
         run: CC=gcc CXX=g++ make nodejstest NUM_THREADS=32
 
@@ -117,7 +117,7 @@ jobs:
         shell: bash
 
   clang-build-test:
-    name: clang build & test
+    name: clang build, test & tidy
     needs: [clang-formatting-check, header-include-guard-check]
     runs-on: kuzu-self-hosted-testing
     steps:
@@ -140,13 +140,26 @@ jobs:
 
       - name: Python test
         run: CC=clang-14 CXX=clang++-14 make pytest NUM_THREADS=32
-      
+
       - name: Node.js test
         run: CC=clang-14 CXX=clang++-14 make nodejstest NUM_THREADS=32
-      
+
       - name: Java test
         run: CC=clang-14 CXX=clang++-14 make javatest NUM_THREADS=32
 
+      - name: Create compilation commands database
+        run: make clang-tidy-ci
+
+      - name: Run clang-tidy on src
+        run: run-clang-tidy -p build/release -quiet -j 32 "^$(realpath src)"
+
+      - name: Run clang-tidy on tools
+        run: run-clang-tidy -p build/release -quiet -j 32 "^$(realpath tools)/(?!shell/linenoise.cpp)"
+
+      - name: Run clang-tidy on examples
+        run: run-clang-tidy -p build/release -quiet -j 32 "^$(realpath examples)"
+
+
   msvc-build-test:
     name: msvc build & test
     needs: [clang-formatting-check, header-include-guard-check]
@@ -226,12 +239,12 @@ jobs:
     runs-on: kuzu-self-hosted-benchmarking
     steps:
       - uses: actions/checkout@v3
-      
+
       - name: Ensure Python dependencies
         run: |
           pip install torch~=1.13 --extra-index-url https://download.pytorch.org/whl/cpu &&\
           pip install --user -r tools/python_api/requirements_dev.txt -f https://data.pyg.org/whl/torch-1.13.0+cpu.html
-      
+
       - name: Ensure Node.js dependencies
         run: npm install
         working-directory: tools/nodejs_api
diff --git a/Makefile b/Makefile
index 0ed0f0cfc4..7190461a1a 100644
--- a/Makefile
+++ b/Makefile
@@ -103,6 +103,16 @@ lcov:
 	cd $(ROOT_DIR)/build/release/test && \
 	ctest --output-on-failure -j ${TEST_JOBS}
 
+clangd:
+	$(call mkdirp,build/clangd) && cd build/clangd && \
+	cmake $(GENERATOR) $(FORCE_COLOR) $(SANITIZER_FLAG) -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DCMAKE_BUILD_TYPE=Release -DBUILD_EXAMPLES=TRUE \
+		-DBUILD_BENCHMARK=TRUE -DBUILD_JAVA=TRUE -DBUILD_NODEJS=TRUE -DBUILD_TESTS=TRUE ../..
+
+clang-tidy-ci:
+	# For CI: export compile commands for the built items.
+	$(call mkdirp,build/release) && cd build/release && \
+	cmake $(GENERATOR) $(FORCE_COLOR) $(SANITIZER_FLAG) -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ../..
+
 pytest:
 	$(MAKE) release
 	cd $(ROOT_DIR)/tools/python_api/test && \
diff --git a/dataset/invalid-utf8/invalid-utf8.csv b/dataset/invalid-utf8/invalid-utf8.csv
new file mode 100644
index 0000000000..ce542efaa5
--- /dev/null
+++ b/dataset/invalid-utf8/invalid-utf8.csv
@@ -0,0 +1 @@
+ÿ
\ No newline at end of file
diff --git a/src/common/arrow/arrow_converter.cpp b/src/common/arrow/arrow_converter.cpp
index 1cd7156214..51e827dbfc 100644
--- a/src/common/arrow/arrow_converter.cpp
+++ b/src/common/arrow/arrow_converter.cpp
@@ -44,7 +44,7 @@ void ArrowConverter::setArrowFormatForStruct(
     ArrowSchemaHolder& rootHolder, ArrowSchema& child, const main::DataTypeInfo& typeInfo) {
     auto& childrenTypesInfo = typeInfo.childrenTypesInfo;
     child.format = "+s";
-    child.name = typeInfo.name.c_str();
+    // name is set by parent.
     child.n_children = (std::int64_t)childrenTypesInfo.size();
     rootHolder.nestedChildren.emplace_back();
     rootHolder.nestedChildren.back().resize(child.n_children);
diff --git a/src/common/types/date_t.cpp b/src/common/types/date_t.cpp
index 41acd57674..20368fd367 100644
--- a/src/common/types/date_t.cpp
+++ b/src/common/types/date_t.cpp
@@ -274,7 +274,7 @@ bool Date::tryConvertDate(const char* buf, uint64_t len, uint64_t& pos, date_t&
     int32_t day = 0;
     int32_t month = -1;
     int32_t year = 0;
-    int sep;
+    char sep;
 
     // skip leading spaces
     while (pos < len && StringUtils::CharacterIsSpace(buf[pos])) {
diff --git a/src/common/types/dtime_t.cpp b/src/common/types/dtime_t.cpp
index 1eb7f1422d..41b512f117 100644
--- a/src/common/types/dtime_t.cpp
+++ b/src/common/types/dtime_t.cpp
@@ -62,7 +62,7 @@ bool Time::TryConvertTime(const char* buf, uint64_t len, uint64_t& pos, dtime_t&
         return false;
     }
 
-    int sep;
+    char sep;
 
     // skip leading spaces
     while (pos < len && isspace(buf[pos])) {
diff --git a/src/common/types/types.cpp b/src/common/types/types.cpp
index 5f96367dd4..6bab4e47a9 100644
--- a/src/common/types/types.cpp
+++ b/src/common/types/types.cpp
@@ -448,12 +448,10 @@ LogicalTypeID LogicalTypeUtils::dataTypeIDFromString(const std::string& dataType
         return LogicalTypeID::INTERNAL_ID;
     } else if ("INT64" == upperDataTypeIDString) {
         return LogicalTypeID::INT64;
-    } else if ("INT32" == upperDataTypeIDString) {
+    } else if ("INT32" == upperDataTypeIDString || "INT" == upperDataTypeIDString) {
         return LogicalTypeID::INT32;
     } else if ("INT16" == upperDataTypeIDString) {
         return LogicalTypeID::INT16;
-    } else if ("INT" == upperDataTypeIDString) {
-        return LogicalTypeID::INT32;
     } else if ("INT8" == upperDataTypeIDString) {
         return LogicalTypeID::INT8;
     } else if ("UINT64" == upperDataTypeIDString) {
diff --git a/src/function/base_lower_upper_operation.cpp b/src/function/base_lower_upper_operation.cpp
index ade1725904..5f1f8c48a4 100644
--- a/src/function/base_lower_upper_operation.cpp
+++ b/src/function/base_lower_upper_operation.cpp
@@ -1,5 +1,9 @@
+#include "common/assert.h"
+#include "common/exception/runtime.h"
+#include "common/string_utils.h"
 #include "function/string/functions/base_lower_upper_function.h"
 
+using namespace kuzu::common;
 using namespace kuzu::utf8proc;
 
 namespace kuzu {
@@ -13,6 +17,12 @@ uint32_t BaseLowerUpperFunction::getResultLen(char* inputStr, uint32_t inputLen,
         if (inputStr[i] & 0x80) {
             int size = 0;
             int codepoint = utf8proc_codepoint(inputStr + i, size);
+            if (codepoint < 0) {
+                // TODO(Xiyang): We shouldn't allow invalid UTF-8 to enter a string column.
+                std::string funcName = isUpper ? "UPPER" : "LOWER";
+                throw RuntimeException(common::StringUtils::string_format(
+                    "Failed calling {}: Invalid UTF-8.", funcName));
+            }
             int convertedCodepoint =
                 isUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
             int newSize = utf8proc_codepoint_length(convertedCodepoint);
@@ -32,10 +42,10 @@ void BaseLowerUpperFunction::convertCase(char* result, uint32_t len, char* input
         if (input[i] & 0x80) {
             int size = 0, newSize = 0;
             int codepoint = utf8proc_codepoint(input + i, size);
+            assert(codepoint >= 0); // Validity ensured by getResultLen.
             int convertedCodepoint =
                 toUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
-            auto success = utf8proc_codepoint_to_utf8(convertedCodepoint, newSize, result);
-            assert(success);
+            utf8proc_codepoint_to_utf8(convertedCodepoint, newSize, result);
             result += newSize;
             i += size;
         } else {
diff --git a/src/include/common/system_message.h b/src/include/common/system_message.h
new file mode 100644
index 0000000000..0ed6e2e30d
--- /dev/null
+++ b/src/include/common/system_message.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <cerrno>
+#include <string>
+#include <system_error>
+
+namespace kuzu {
+namespace common {
+inline std::string systemErrMessage(int code) {
+    // System errors are unexpected. For anything expected, we should catch it explicitly and
+    // provide a better error message to the user.
+    // LCOV_EXCL_START
+    return std::system_category().message(code);
+    // LCOV_EXCL_END
+}
+inline std::string posixErrMessage() {
+    // LCOV_EXCL_START
+    return systemErrMessage(errno);
+    // LCOV_EXCL_END
+}
+} // namespace common
+} // namespace kuzu
diff --git a/src/include/common/types/cast_helpers.h b/src/include/common/types/cast_helpers.h
index 2a313348ab..442a9126c2 100644
--- a/src/include/common/types/cast_helpers.h
+++ b/src/include/common/types/cast_helpers.h
@@ -122,7 +122,7 @@ struct DateToStringCast {
         }
         // optionally add BC to the end of the date
         if (add_bc) {
-            memcpy(ptr, " (BC)", 5);
+            strcpy(ptr, " (BC)");
         }
     }
 };
@@ -282,7 +282,7 @@ struct IntervalToStringCast {
             }
         } else if (length == 0) {
             // empty interval: default to 00:00:00
-            memcpy(buffer, "00:00:00", 8);
+            strcpy(buffer, "00:00:00");
             return 8;
         }
         return length;
diff --git a/src/include/function/arithmetic/arithmetic_functions.h b/src/include/function/arithmetic/arithmetic_functions.h
index 8456536c6a..375628b40a 100644
--- a/src/include/function/arithmetic/arithmetic_functions.h
+++ b/src/include/function/arithmetic/arithmetic_functions.h
@@ -206,7 +206,8 @@ struct Gamma {
 struct Lgamma {
     template<class T>
     static inline void operation(T& input, double_t& result) {
-        result = lgamma(input);
+        result =
+            lgamma(input); // NOLINT(concurrency-mt-unsafe): We don't use the thread-unsafe signgam.
     }
 };
 
diff --git a/src/include/function/boolean/boolean_functions.h b/src/include/function/boolean/boolean_functions.h
index 8147bf535d..f90a8b5743 100644
--- a/src/include/function/boolean/boolean_functions.h
+++ b/src/include/function/boolean/boolean_functions.h
@@ -38,9 +38,7 @@ const uint8_t NULL_BOOL = 2;
 struct And {
     static inline void operation(
         bool left, bool right, uint8_t& result, bool isLeftNull, bool isRightNull) {
-        if (!left && !isLeftNull) {
-            result = false;
-        } else if (!right && !isRightNull) {
+        if ((!left && !isLeftNull) || (!right && !isRightNull)) {
             result = false;
         } else if (isLeftNull || isRightNull) {
             result = NULL_BOOL;
@@ -68,9 +66,7 @@ struct And {
 struct Or {
     static inline void operation(
         bool left, bool right, uint8_t& result, bool isLeftNull, bool isRightNull) {
-        if (left && !isLeftNull) {
-            result = true;
-        } else if (right && !isRightNull) {
+        if ((left && !isLeftNull) || (right && !isRightNull)) {
             result = true;
         } else if (isLeftNull || isRightNull) {
             result = NULL_BOOL;
diff --git a/src/include/processor/operator/scan_node_id.h b/src/include/processor/operator/scan_node_id.h
index 95a43afb05..2b586e24a9 100644
--- a/src/include/processor/operator/scan_node_id.h
+++ b/src/include/processor/operator/scan_node_id.h
@@ -43,13 +43,7 @@ class ScanNodeIDSharedState {
     inline uint32_t getNumTableStates() const { return tableStates.size(); }
     inline NodeTableScanState* getTableState(uint32_t idx) const { return tableStates[idx].get(); }
 
-    inline void initialize(transaction::Transaction* transaction) {
-        auto numMask = tableStates[0]->getSemiMask()->getNumMasks();
-        for (auto& tableState : tableStates) {
-            assert(tableState->getSemiMask()->getNumMasks() == numMask);
-            tableState->initializeMaxOffset(transaction);
-        }
-    }
+    void initialize(transaction::Transaction* transaction);
 
     std::tuple<NodeTableScanState*, common::offset_t, common::offset_t> getNextRangeToRead();
 
diff --git a/src/main/plan_printer.cpp b/src/main/plan_printer.cpp
index 8290ffe9fc..3710de1470 100644
--- a/src/main/plan_printer.cpp
+++ b/src/main/plan_printer.cpp
@@ -150,6 +150,10 @@ void OpProfileTree::printOpProfileBoxUpperFrame(uint32_t rowIdx, std::ostringstr
     oss << std::endl;
 }
 
+static std::string dashedLineAccountingForIndex(uint32_t width, uint32_t indent) {
+    return std::string(width - (1 + indent) * 2, '-');
+}
+
 void OpProfileTree::printOpProfileBoxes(uint32_t rowIdx, std::ostringstream& oss) const {
     auto height = calculateRowHeight(rowIdx);
     auto halfWayPoint = height / 2;
@@ -164,12 +168,13 @@ void OpProfileTree::printOpProfileBoxes(uint32_t rowIdx, std::ostringstream& oss
                 unsigned int numParams = opProfileBox->getNumParams();
                 if (i == 0) {
                     textToPrint = opProfileBox->getOpName();
-                } else if (i == 1) {
-                    textToPrint = std::string(opProfileBoxWidth - (1 + INDENT_WIDTH) * 2, '-');
+                } else if (i == 1) { // NOLINT(bugprone-branch-clone): Merging these branches is a
+                                     // logical error, and this conditional chain is pleasant.
+                    textToPrint = dashedLineAccountingForIndex(opProfileBoxWidth, INDENT_WIDTH);
                 } else if (i <= numParams + 1) {
                     textToPrint = opProfileBox->getParamsName(i - 2);
                 } else if ((i - numParams - 1) % 2) {
-                    textToPrint = std::string(opProfileBoxWidth - (1 + INDENT_WIDTH) * 2, '-');
+                    textToPrint = dashedLineAccountingForIndex(opProfileBoxWidth, INDENT_WIDTH);
                 } else {
                     textToPrint = opProfileBox->getAttribute((i - numParams - 1) / 2 - 1);
                 }
diff --git a/src/main/query_result.cpp b/src/main/query_result.cpp
index 4d96ef3607..8a3f08ab4e 100644
--- a/src/main/query_result.cpp
+++ b/src/main/query_result.cpp
@@ -91,7 +91,7 @@ std::vector<std::unique_ptr<DataTypeInfo>> QueryResult::getColumnTypesInfo() con
             columnTypeInfo->childrenTypesInfo.push_back(
                 DataTypeInfo::getInfoForDataType(LogicalType(LogicalTypeID::STRING), "_label"));
             auto numProperties = NodeVal::getNumProperties(value);
-            for (auto j = 0u; i < numProperties; j++) {
+            for (auto j = 0u; j < numProperties; j++) {
                 auto name = NodeVal::getPropertyName(value, j);
                 auto val = NodeVal::getPropertyVal(value, j);
                 columnTypeInfo->childrenTypesInfo.push_back(
@@ -104,9 +104,9 @@ std::vector<std::unique_ptr<DataTypeInfo>> QueryResult::getColumnTypesInfo() con
             columnTypeInfo->childrenTypesInfo.push_back(
                 DataTypeInfo::getInfoForDataType(LogicalType(LogicalTypeID::INTERNAL_ID), "_dst"));
             auto numProperties = RelVal::getNumProperties(value);
-            for (auto j = 0u; i < numProperties; j++) {
-                auto name = NodeVal::getPropertyName(value, j);
-                auto val = NodeVal::getPropertyVal(value, j);
+            for (auto j = 0u; j < numProperties; j++) {
+                auto name = RelVal::getPropertyName(value, j);
+                auto val = RelVal::getPropertyVal(value, j);
                 columnTypeInfo->childrenTypesInfo.push_back(
                     DataTypeInfo::getInfoForDataType(*val->getDataType(), name));
             }
diff --git a/src/optimizer/agg_key_dependency_optimizer.cpp b/src/optimizer/agg_key_dependency_optimizer.cpp
index eae657dffd..64d1394868 100644
--- a/src/optimizer/agg_key_dependency_optimizer.cpp
+++ b/src/optimizer/agg_key_dependency_optimizer.cpp
@@ -67,16 +67,17 @@ AggKeyDependencyOptimizer::resolveKeysAndDependentKeys(const binder::expression_
     for (auto& expression : keys) {
         if (expression->expressionType == PROPERTY) {
             auto propertyExpression = (binder::PropertyExpression*)expression.get();
-            if (propertyExpression->isPrimaryKey() || propertyExpression->isInternalID()) {
+            if (propertyExpression->isPrimaryKey() ||
+                propertyExpression->isInternalID()) { // NOLINT(bugprone-branch-clone): Collapsing
+                                                      // is a logical error.
                 groupExpressions.push_back(expression);
             } else if (primaryVarNames.contains(propertyExpression->getVariableName())) {
                 dependentExpressions.push_back(expression);
             } else {
                 groupExpressions.push_back(expression);
             }
-        } else if (ExpressionUtil::isNodeVariable(*expression)) {
-            dependentExpressions.push_back(expression);
-        } else if (ExpressionUtil::isRelVariable(*expression)) {
+        } else if (ExpressionUtil::isNodeVariable(*expression) ||
+                   ExpressionUtil::isRelVariable(*expression)) {
             dependentExpressions.push_back(expression);
         } else {
             groupExpressions.push_back(expression);
diff --git a/src/processor/operator/persistent/copy_rel_columns.cpp b/src/processor/operator/persistent/copy_rel_columns.cpp
index daa7c8b42b..d5ab32fd8e 100644
--- a/src/processor/operator/persistent/copy_rel_columns.cpp
+++ b/src/processor/operator/persistent/copy_rel_columns.cpp
@@ -26,6 +26,7 @@ void CopyRelColumns::executeInternal(ExecutionContext* context) {
             numBwdRows = countRelLists(sharedState->bwdRelData.get(), info.nbrOffsetPos);
         }
         assert(numFwdRows == numBwdRows);
+        (void)numBwdRows; // For clang-tidy, value calculated for debug assert.
         numRows += numFwdRows;
     }
     sharedState->incrementNumRows(numRows);
diff --git a/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp b/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp
index f1953d0576..4273731d20 100644
--- a/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp
+++ b/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp
@@ -5,6 +5,7 @@
 #include "common/data_chunk/data_chunk.h"
 #include "common/exception/copy.h"
 #include "common/string_utils.h"
+#include "common/system_message.h"
 #include "processor/operator/persistent/reader/csv/driver.h"
 
 using namespace kuzu::common;
@@ -23,8 +24,10 @@ BaseCSVReader::BaseCSVReader(const std::string& filePath, const common::ReaderCo
 #endif
     );
     if (fd == -1) {
+        // LCOV_EXCL_START
         throw CopyException(
-            StringUtils::string_format("Could not open file {}: {}", filePath, strerror(errno)));
+            StringUtils::string_format("Could not open file {}: {}", filePath, posixErrMessage()));
+        // LCOV_EXCL_END
     }
 }
 
@@ -195,8 +198,10 @@ bool BaseCSVReader::readBuffer(uint64_t* start) {
 
     uint64_t readCount = read(fd, buffer.get() + remaining, bufferReadSize);
     if (readCount == -1) {
+        // LCOV_EXCL_START
         throw CopyException(StringUtils::string_format(
-            "Could not read from file {}: {}", filePath, strerror(errno)));
+            "Could not read from file {}: {}", filePath, posixErrMessage()));
+        // LCOV_EXCL_END
     }
 
     bufferSize = remaining + readCount;
@@ -414,7 +419,7 @@ uint64_t BaseCSVReader::getFileOffset() const {
     if (offset == -1) {
         // LCOV_EXCL_START
         throw CopyException(StringUtils::string_format(
-            "Could not get current file position for file {}: {}", filePath, strerror(errno)));
+            "Could not get current file position for file {}: {}", filePath, posixErrMessage()));
         // LCOV_EXCL_END
     }
     assert(offset >= bufferSize);
@@ -429,7 +434,7 @@ uint64_t BaseCSVReader::getLineNumber() {
     if (lseek(fd, 0, SEEK_SET) == -1) {
         // LCOV_EXCL_START
         throw CopyException(StringUtils::string_format(
-            "Could not seek to beginning of file {}: {}", filePath, strerror(errno)));
+            "Could not seek to beginning of file {}: {}", filePath, posixErrMessage()));
         // LCOV_EXCL_END
     }
 
@@ -440,16 +445,13 @@ uint64_t BaseCSVReader::getLineNumber() {
         if (bytesRead == -1) {
             // LCOV_EXCL_START
             throw CopyException(StringUtils::string_format(
-                "Could not read from file {}: {}", filePath, strerror(errno)));
+                "Could not read from file {}: {}", filePath, posixErrMessage()));
             // LCOV_EXCL_END
         }
         totalBytes += bytesRead;
 
         for (uint64_t i = 0; i < bytesRead; i++) {
-            if (buf[i] == '\n') {
-                lineNumber++;
-                carriageReturn = false;
-            } else if (carriageReturn) {
+            if (buf[i] == '\n' || carriageReturn) {
                 lineNumber++;
                 carriageReturn = false;
             }
diff --git a/src/processor/operator/persistent/reader/csv/parallel_csv_reader.cpp b/src/processor/operator/persistent/reader/csv/parallel_csv_reader.cpp
index baf64fbb3a..7cd1434648 100644
--- a/src/processor/operator/persistent/reader/csv/parallel_csv_reader.cpp
+++ b/src/processor/operator/persistent/reader/csv/parallel_csv_reader.cpp
@@ -2,6 +2,7 @@
 
 #include "common/exception/copy.h"
 #include "common/string_utils.h"
+#include "common/system_message.h"
 #include "processor/operator/persistent/reader/csv/driver.h"
 
 using namespace kuzu::common;
@@ -46,7 +47,7 @@ void ParallelCSVReader::seekToBlockStart() {
     if (lseek(fd, currentBlockIdx * CopyConstants::PARALLEL_BLOCK_SIZE, SEEK_SET) == -1) {
         // LCOV_EXCL_START
         throw CopyException(StringUtils::string_format("Failed to seek to block {} in file {}: {}",
-            currentBlockIdx, filePath, strerror(errno)));
+            currentBlockIdx, filePath, posixErrMessage()));
         // LCOV_EXCL_END
     }
 
diff --git a/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp b/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp
index 6655bf934e..604f93ba86 100644
--- a/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp
+++ b/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp
@@ -243,16 +243,6 @@ std::unique_ptr<ColumnReader> ParquetReader::createReaderRecursive(uint64_t dept
         bool isRepeated = repetition_type == FieldRepetitionType::REPEATED;
         bool isList = sEle.__isset.converted_type && sEle.converted_type == ConvertedType::LIST;
         bool isMap = sEle.__isset.converted_type && sEle.converted_type == ConvertedType::MAP;
-        bool isMapKv =
-            sEle.__isset.converted_type && sEle.converted_type == ConvertedType::MAP_KEY_VALUE;
-        if (!isMapKv && thisIdx > 0) {
-            // check if the parent node of this is a map
-            auto& pEle = metadata->schema[thisIdx - 1];
-            bool parentIsMap =
-                pEle.__isset.converted_type && pEle.converted_type == ConvertedType::MAP;
-            bool parentHasChildren = pEle.__isset.num_children && pEle.num_children == 1;
-            isMapKv = parentIsMap && parentHasChildren;
-        }
         if (structFields.size() > 1 || (!isList && !isMap && !isRepeated)) {
             resultType = std::make_unique<common::LogicalType>(common::LogicalTypeID::STRUCT,
                 std::make_unique<common::StructTypeInfo>(std::move(structFields)));
diff --git a/src/processor/operator/persistent/reader_functions.cpp b/src/processor/operator/persistent/reader_functions.cpp
index c95aca4063..9c0a7797b2 100644
--- a/src/processor/operator/persistent/reader_functions.cpp
+++ b/src/processor/operator/persistent/reader_functions.cpp
@@ -1,6 +1,7 @@
 #include "processor/operator/persistent/reader_functions.h"
 
 #include "common/exception/copy.h"
+#include "common/system_message.h"
 #include <arrow/table.h>
 
 using namespace kuzu::common;
@@ -202,14 +203,18 @@ std::vector<FileBlocksInfo> ReaderFunctions::countRowsInParallelCSVFile(
     for (const auto& path : config.filePaths) {
         int fd = open(path.c_str(), O_RDONLY);
         if (fd == -1) {
+            // LCOV_EXCL_START
             throw CopyException(
-                StringUtils::string_format("Failed to open file {}: {}", path, strerror(errno)));
+                StringUtils::string_format("Failed to open file {}: {}", path, posixErrMessage()));
+            // LCOV_EXCL_END
         }
         uint64_t length = lseek(fd, 0, SEEK_END);
         close(fd);
         if (length == -1) {
+            // LCOV_EXCL_START
             throw CopyException(StringUtils::string_format(
-                "Failed to seek to end of file {}: {}", path, strerror(errno)));
+                "Failed to seek to end of file {}: {}", path, posixErrMessage()));
+            // LCOV_EXCL_END
         }
         auto reader = make_unique<ParallelCSVReader>(path, config);
         row_idx_t numRowsInFile = reader->countRows();
diff --git a/src/processor/operator/scan_node_id.cpp b/src/processor/operator/scan_node_id.cpp
index 2888319c34..569c3e65a7 100644
--- a/src/processor/operator/scan_node_id.cpp
+++ b/src/processor/operator/scan_node_id.cpp
@@ -24,6 +24,15 @@ std::pair<offset_t, offset_t> NodeTableScanState::getNextRangeToRead() {
     return std::make_pair(startOffset, startOffset + range);
 }
 
+void ScanNodeIDSharedState::initialize(transaction::Transaction* transaction) {
+    auto numMask = tableStates[0]->getSemiMask()->getNumMasks();
+    for (auto& tableState : tableStates) {
+        assert(tableState->getSemiMask()->getNumMasks() == numMask);
+        tableState->initializeMaxOffset(transaction);
+    }
+    (void)numMask; // For clang-tidy: used for assert.
+}
+
 std::tuple<NodeTableScanState*, offset_t, offset_t> ScanNodeIDSharedState::getNextRangeToRead() {
     std::unique_lock lck{mtx};
     if (currentStateIdx == tableStates.size()) {
diff --git a/src/processor/result/factorized_table.cpp b/src/processor/result/factorized_table.cpp
index 5f17924790..c7ba029925 100644
--- a/src/processor/result/factorized_table.cpp
+++ b/src/processor/result/factorized_table.cpp
@@ -287,8 +287,7 @@ void FactorizedTable::copyToInMemList(ft_col_idx_t colIdx,
     std::vector<ft_tuple_idx_t>& tupleIdxesToRead, uint8_t* data, NullMask* nullMask,
     uint64_t startElemPosInList, DiskOverflowFile* overflowFileOfInMemList,
     const LogicalType& type) const {
-    auto column = tableSchema->getColumn(colIdx);
-    assert(column->isFlat() == true);
+    assert(tableSchema->getColumn(colIdx)->isFlat() == true);
     auto numBytesPerValue = type.getLogicalTypeID() == LogicalTypeID::INTERNAL_ID ?
                                 sizeof(offset_t) :
                                 LogicalTypeUtils::getRowLayoutSize(type);
diff --git a/src/storage/buffer_manager/vm_region.cpp b/src/storage/buffer_manager/vm_region.cpp
index 5bda4c80f2..84e2d43e26 100644
--- a/src/storage/buffer_manager/vm_region.cpp
+++ b/src/storage/buffer_manager/vm_region.cpp
@@ -1,6 +1,7 @@
 #include "storage/buffer_manager/vm_region.h"
 
 #include "common/string_utils.h"
+#include "common/system_message.h"
 
 #ifdef _WIN32
 #include <errhandlingapi.h>
@@ -71,17 +72,20 @@ void VMRegion::releaseFrame(frame_idx_t frameIdx) {
     // See https://arvid.io/2018/04/02/memory-mapping-on-windows/#1
     // Not sure what the differences are
     if (!VirtualFree(getFrame(frameIdx), frameSize, MEM_DECOMMIT)) {
+        auto code = GetLastError();
         throw BufferManagerException(StringUtils::string_format(
             "Releasing physical memory associated with a frame failed with error code {}: {}.",
-            GetLastError(), std::system_category().message(GetLastError())));
+            code, systemErrMessage(code)));
     }
 
 #else
     int error = madvise(getFrame(frameIdx), frameSize, MADV_DONTNEED);
     if (error != 0) {
-        throw BufferManagerException(
-            "Releasing physical memory associated with a frame failed with error code " +
-            std::to_string(error) + ": " + std::string(std::strerror(errno)));
+        // LCOV_EXCL_START
+        throw BufferManagerException(StringUtils::string_format(
+            "Releasing physical memory associated with a frame failed with error code {}: {}.",
+            error, posixErrMessage()));
+        // LCOV_EXCL_END
     }
 #endif
 }
diff --git a/src/storage/stats/node_table_statistics.cpp b/src/storage/stats/node_table_statistics.cpp
index ab48dc2532..a285c6588f 100644
--- a/src/storage/stats/node_table_statistics.cpp
+++ b/src/storage/stats/node_table_statistics.cpp
@@ -113,7 +113,7 @@ void NodeTableStatsAndDeletedIDs::setDeletedNodeOffsetsForMorsel(
         auto itr = deletedNodeOffsets.begin();
         sel_t nextDeletedNodeOffset = *itr - morselBeginOffset;
         uint64_t nextSelectedPosition = 0;
-        for (sel_t pos = 0; pos < nodeOffsetVector->state->getOriginalSize(); ++pos) {
+        for (auto pos = 0; pos < nodeOffsetVector->state->getOriginalSize(); ++pos) {
             if (pos == nextDeletedNodeOffset) {
                 itr++;
                 if (itr == deletedNodeOffsets.end()) {
diff --git a/src/storage/storage_structure/lists/lists_update_store.cpp b/src/storage/storage_structure/lists/lists_update_store.cpp
index 82f04fdaad..217659e4f5 100644
--- a/src/storage/storage_structure/lists/lists_update_store.cpp
+++ b/src/storage/storage_structure/lists/lists_update_store.cpp
@@ -322,9 +322,7 @@ ft_col_idx_t ListsUpdatesStore::getColIdxInFT(ListFileID& listFileID) const {
 
 void ListsUpdatesStore::initListsUpdatesPerTablePerDirection() {
     listsUpdatesPerDirection.clear();
-    for (auto direction : RelDataDirectionUtils::getRelDataDirections()) {
-        listsUpdatesPerDirection.emplace_back();
-    }
+    listsUpdatesPerDirection.resize(RelDataDirectionUtils::getRelDataDirections().size());
 }
 
 ListsUpdatesForNodeOffset* ListsUpdatesStore::getOrCreateListsUpdatesForNodeOffset(
diff --git a/src/storage/store/column_chunk.cpp b/src/storage/store/column_chunk.cpp
index 5931535f1e..22f5ba9f5a 100644
--- a/src/storage/store/column_chunk.cpp
+++ b/src/storage/store/column_chunk.cpp
@@ -445,12 +445,10 @@ uint32_t ColumnChunk::getDataTypeSizeInChunk(LogicalType& dataType) {
     case LogicalTypeID::STRING: {
         return sizeof(ku_string_t);
     }
+    case LogicalTypeID::INTERNAL_ID:
     case LogicalTypeID::VAR_LIST: {
         return sizeof(offset_t);
     }
-    case LogicalTypeID::INTERNAL_ID: {
-        return sizeof(offset_t);
-    }
     case LogicalTypeID::SERIAL: {
         return sizeof(int64_t);
     }
diff --git a/test/c_api/query_result_test.cpp b/test/c_api/query_result_test.cpp
index e172571b42..f45d760ba0 100644
--- a/test/c_api/query_result_test.cpp
+++ b/test/c_api/query_result_test.cpp
@@ -80,6 +80,22 @@ TEST_F(CApiQueryResultTest, GetColumnDataType) {
     kuzu_query_result_destroy(result);
 }
 
+TEST_F(CApiQueryResultTest, GetArrowSchema) {
+    auto connection = getConnection();
+    auto result = kuzu_connection_query(connection, "MATCH (p:person)-[k:knows]-(q:person) RETURN "
+                                                    "p.fName, k, q.fName");
+    ASSERT_TRUE(kuzu_query_result_is_success(result));
+    auto schema = kuzu_query_result_get_arrow_schema(result);
+    ASSERT_STREQ(schema.name, "kuzu_query_result");
+    ASSERT_EQ(schema.n_children, 3);
+    ASSERT_STREQ(schema.children[0]->name, "p.fName");
+    ASSERT_STREQ(schema.children[1]->name, "k");
+    ASSERT_STREQ(schema.children[2]->name, "q.fName");
+
+    schema.release(&schema);
+    kuzu_query_result_destroy(result);
+}
+
 TEST_F(CApiQueryResultTest, GetQuerySummary) {
     auto connection = getConnection();
     auto result =
diff --git a/test/test_files/exceptions/runtime/invalid_utf8.test b/test/test_files/exceptions/runtime/invalid_utf8.test
new file mode 100644
index 0000000000..30f9f579da
--- /dev/null
+++ b/test/test_files/exceptions/runtime/invalid_utf8.test
@@ -0,0 +1,9 @@
+-GROUP RuntimeErrorTest
+-DATASET CSV empty
+
+--
+
+-CASE InvalidUTF8
+-STATEMENT LOAD FROM "${KUZU_ROOT_DIRECTORY}/dataset/invalid-utf8/invalid-utf8.csv" RETURN UPPER(column0);
+---- error
+Runtime exception: Failed calling UPPER: Invalid UTF-8.
diff --git a/tools/benchmark/main.cpp b/tools/benchmark/main.cpp
index db940e105e..48264377f4 100644
--- a/tools/benchmark/main.cpp
+++ b/tools/benchmark/main.cpp
@@ -38,16 +38,16 @@ int main(int argc, char** argv) {
             config->bufferPoolSize = (uint64_t)stoull(getArgumentValue(arg)) << 20;
         } else {
             printf("Unrecognized option %s", arg.c_str());
-            exit(1);
+            return 1;
         }
     }
     if (datasetPath.empty()) {
         printf("Missing --dataset input.");
-        exit(1);
+        return 1;
     }
     if (benchmarkPath.empty()) {
         printf("Missing --benchmark input");
-        exit(1);
+        return 1;
     }
     auto runner = BenchmarkRunner(datasetPath, std::move(config));
     try {
diff --git a/tools/java_api/src/jni/kuzu_java.cpp b/tools/java_api/src/jni/kuzu_java.cpp
index d234cf906a..4bd6687fce 100644
--- a/tools/java_api/src/jni/kuzu_java.cpp
+++ b/tools/java_api/src/jni/kuzu_java.cpp
@@ -6,10 +6,10 @@
 #include "com_kuzudb_KuzuNative.h"
 #include "common/exception/exception.h"
 #include "common/types/types.h"
-#include "common/types/value/value.h"
 #include "common/types/value/nested.h"
 #include "common/types/value/node.h"
 #include "common/types/value/rel.h"
+#include "common/types/value/value.h"
 #include "json.hpp"
 #include "main/kuzu.h"
 #include "main/query_summary.h"
@@ -152,8 +152,8 @@ void javaMapToCPPMap(
  * All Database native functions
  */
 
-JNIEXPORT jlong JNICALL Java_com_kuzudb_KuzuNative_kuzu_1database_1init(
-    JNIEnv* env, jclass, jstring database_path, jlong buffer_pool_size, jboolean enable_compression) {
+JNIEXPORT jlong JNICALL Java_com_kuzudb_KuzuNative_kuzu_1database_1init(JNIEnv* env, jclass,
+    jstring database_path, jlong buffer_pool_size, jboolean enable_compression) {
 
     const char* path = env->GetStringUTFChars(database_path, JNI_FALSE);
     uint64_t buffer = static_cast<uint64_t>(buffer_pool_size);
@@ -887,7 +887,6 @@ JNIEXPORT jobject JNICALL Java_com_kuzudb_KuzuNative_kuzu_1value_1get_1value(
         return ret;
     }
     case LogicalTypeID::DATE: {
-        jclass retClass = env->FindClass("java/time/LocalDate");
         date_t date = v->getValue<date_t>();
         jclass ldClass = env->FindClass("java/time/LocalDate");
         jmethodID ofEpochDay =
diff --git a/tools/nodejs_api/CMakeLists.txt b/tools/nodejs_api/CMakeLists.txt
index 55421c56a7..04fbd82692 100644
--- a/tools/nodejs_api/CMakeLists.txt
+++ b/tools/nodejs_api/CMakeLists.txt
@@ -15,18 +15,18 @@ else()
 endif()
 
 execute_process(
-  COMMAND ${NPX_CMD} cmake-js print-cmakejs-include 
-  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 
+  COMMAND ${NPX_CMD} cmake-js print-cmakejs-include
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   OUTPUT_VARIABLE CMAKE_JS_INC
 )
 execute_process(
-  COMMAND ${NPX_CMD} cmake-js print-cmakejs-lib 
-  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 
+  COMMAND ${NPX_CMD} cmake-js print-cmakejs-lib
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   OUTPUT_VARIABLE CMAKE_JS_LIB
 )
 execute_process(
   COMMAND ${NPX_CMD} cmake-js print-cmakejs-src
-  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   OUTPUT_VARIABLE CMAKE_JS_SRC
 )
 
@@ -48,10 +48,10 @@ file(COPY ${JS_SOURCE_FILES} DESTINATION  "${CMAKE_CURRENT_SOURCE_DIR}/build")
 
 add_library(${PROJECT_NAME} SHARED ${CPP_SOURCE_FILES} ${CMAKE_JS_SRC})
 set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
-set_target_properties(${PROJECT_NAME} 
+set_target_properties(${PROJECT_NAME}
   PROPERTIES
   RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build"
-  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build" 
+  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build"
   ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build")
 if(APPLE)
   target_link_options(${PROJECT_NAME} PRIVATE -undefined dynamic_lookup)
diff --git a/tools/shell/embedded_shell.cpp b/tools/shell/embedded_shell.cpp
index 008000c78c..b6babf8064 100644
--- a/tools/shell/embedded_shell.cpp
+++ b/tools/shell/embedded_shell.cpp
@@ -162,7 +162,6 @@ void highlight(char* buffer, char* resultBuf, uint32_t maxLen, uint32_t cursorPo
         }
         lineLen -= thisChar;
         buf = buf.substr(thisChar, lineLen);
-        bufLen = buf.length();
     } else if (buf.length() > maxLen) {
         uint32_t counter = 0;
         uint32_t lineLen = 0;
@@ -171,7 +170,6 @@ void highlight(char* buffer, char* resultBuf, uint32_t maxLen, uint32_t cursorPo
             lineLen = utf8proc_next_grapheme(buffer, bufLen, lineLen);
         }
         buf = buf.substr(0, lineLen);
-        bufLen = buf.length();
     }
     for (auto i = 0u; i < buf.length(); i++) {
         if ((buf[i] != ' ' && !word.empty() && word[0] == ' ') ||