diff --git a/src/evaluate.cpp b/src/evaluate.cpp index dcbfedb499a..9b2af02d2ea 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -25,12 +25,14 @@ #include #include #include +#include #include "nnue/network.h" #include "nnue/nnue_misc.h" #include "position.h" #include "types.h" #include "uci.h" +#include "nnue/nnue_accumulator.h" namespace Stockfish { @@ -45,7 +47,10 @@ int Eval::simple_eval(const Position& pos, Color c) { // Evaluate is the evaluator for the outer world. It returns a static evaluation // of the position from the point of view of the side to move. -Value Eval::evaluate(const Eval::NNUE::Networks& networks, const Position& pos, int optimism) { +Value Eval::evaluate(const Eval::NNUE::Networks& networks, + const Position& pos, + Eval::NNUE::AccumulatorCaches& caches, + int optimism) { assert(!pos.checkers()); @@ -55,8 +60,8 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, const Position& pos, int nnueComplexity; int v; - Value nnue = smallNet ? networks.small.evaluate(pos, true, &nnueComplexity, psqtOnly) - : networks.big.evaluate(pos, true, &nnueComplexity, false); + Value nnue = smallNet ? networks.small.evaluate(pos, nullptr, true, &nnueComplexity, psqtOnly) + : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false); const auto adjustEval = [&](int optDiv, int nnueDiv, int pawnCountConstant, int pawnCountMul, int npmConstant, int evalDiv, int shufflingConstant, @@ -94,20 +99,22 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, const Position& pos, // Trace scores are from white's point of view std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) { + auto caches = std::make_unique(); + if (pos.checkers()) return "Final evaluation: none (in check)"; std::stringstream ss; ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2); - ss << '\n' << NNUE::trace(pos, networks) << '\n'; + ss << '\n' << NNUE::trace(pos, networks, *caches) << '\n'; ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15); - Value v = networks.big.evaluate(pos, false); + Value v = networks.big.evaluate(pos, &caches->big, false); v = pos.side_to_move() == WHITE ? v : -v; ss << "NNUE evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)\n"; - v = evaluate(networks, pos, VALUE_ZERO); + v = evaluate(networks, pos, *caches, VALUE_ZERO); v = pos.side_to_move() == WHITE ? v : -v; ss << "Final evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)"; ss << " [with scaled NNUE, ...]"; diff --git a/src/evaluate.h b/src/evaluate.h index da9c7074ec1..38615ff7d68 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -40,14 +40,16 @@ constexpr inline int SmallNetThreshold = 1274, PsqtOnlyThreshold = 2389; namespace NNUE { struct Networks; +struct AccumulatorCaches; } std::string trace(Position& pos, const Eval::NNUE::Networks& networks); int simple_eval(const Position& pos, Color c); -Value evaluate(const NNUE::Networks& networks, const Position& pos, int optimism); - - +Value evaluate(const NNUE::Networks& networks, + const Position& pos, + Eval::NNUE::AccumulatorCaches& caches, + int optimism); } // namespace Eval } // namespace Stockfish diff --git a/src/nnue/features/half_ka_v2_hm.cpp b/src/nnue/features/half_ka_v2_hm.cpp index 5789db4844a..71782a7b731 100644 --- a/src/nnue/features/half_ka_v2_hm.cpp +++ b/src/nnue/features/half_ka_v2_hm.cpp @@ -23,7 +23,7 @@ #include "../../bitboard.h" #include "../../position.h" #include "../../types.h" -#include "../nnue_common.h" +#include "../nnue_accumulator.h" namespace Stockfish::Eval::NNUE::Features { @@ -49,6 +49,8 @@ void HalfKAv2_hm::append_active_indices(const Position& pos, IndexList& active) // Explicit template instantiations template void HalfKAv2_hm::append_active_indices(const Position& pos, IndexList& active); template void HalfKAv2_hm::append_active_indices(const Position& pos, IndexList& active); +template IndexType HalfKAv2_hm::make_index(Square s, Piece pc, Square ksq); +template IndexType HalfKAv2_hm::make_index(Square s, Piece pc, Square ksq); // Get a list of indices for recently changed features template diff --git a/src/nnue/features/half_ka_v2_hm.h b/src/nnue/features/half_ka_v2_hm.h index 8363184f430..96349704745 100644 --- a/src/nnue/features/half_ka_v2_hm.h +++ b/src/nnue/features/half_ka_v2_hm.h @@ -63,10 +63,6 @@ class HalfKAv2_hm { {PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_KING, PS_NONE, PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_KING, PS_NONE}}; - // Index of a feature for a given king position and another piece on some square - template - static IndexType make_index(Square s, Piece pc, Square ksq); - public: // Feature name static constexpr const char* Name = "HalfKAv2_hm(Friend)"; @@ -126,6 +122,10 @@ class HalfKAv2_hm { static constexpr IndexType MaxActiveDimensions = 32; using IndexList = ValueList; + // Index of a feature for a given king position and another piece on some square + template + static IndexType make_index(Square s, Piece pc, Square ksq); + // Get a list of indices for active features template static void append_active_indices(const Position& pos, IndexList& active); diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index bea3e7cb398..656ad97a1e3 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -186,10 +186,11 @@ bool Network::save(const std::optional& filename template -Value Network::evaluate(const Position& pos, - bool adjusted, - int* complexity, - bool psqtOnly) const { +Value Network::evaluate(const Position& pos, + AccumulatorCaches::Cache* cache, + bool adjusted, + int* complexity, + bool psqtOnly) const { // We manually align the arrays on the stack because with gcc < 9.3 // overaligning stack variables with alignas() doesn't work correctly. @@ -197,20 +198,21 @@ Value Network::evaluate(const Position& pos, constexpr int delta = 24; #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) - TransformedFeatureType transformedFeaturesUnaligned - [FeatureTransformer::BufferSize - + alignment / sizeof(TransformedFeatureType)]; + TransformedFeatureType + transformedFeaturesUnaligned[FeatureTransformer::BufferSize + + alignment / sizeof(TransformedFeatureType)]; auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); #else - alignas(alignment) TransformedFeatureType transformedFeatures - [FeatureTransformer::BufferSize]; + alignas(alignment) TransformedFeatureType + transformedFeatures[FeatureTransformer::BufferSize]; #endif ASSERT_ALIGNED(transformedFeatures, alignment); const int bucket = (pos.count() - 1) / 4; - const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket, psqtOnly); + const auto psqt = + featureTransformer->transform(pos, cache, transformedFeatures, bucket, psqtOnly); const auto positional = !psqtOnly ? (network[bucket]->propagate(transformedFeatures)) : 0; if (complexity) @@ -255,26 +257,29 @@ void Network::verify(std::string evalfilePath) const { template -void Network::hint_common_access(const Position& pos, bool psqtOnl) const { - featureTransformer->hint_common_access(pos, psqtOnl); +void Network::hint_common_access(const Position& pos, + AccumulatorCaches::Cache* cache, + bool psqtOnl) const { + featureTransformer->hint_common_access(pos, cache, psqtOnl); } - template -NnueEvalTrace Network::trace_evaluate(const Position& pos) const { +NnueEvalTrace +Network::trace_evaluate(const Position& pos, + AccumulatorCaches::Cache* cache) const { // We manually align the arrays on the stack because with gcc < 9.3 // overaligning stack variables with alignas() doesn't work correctly. constexpr uint64_t alignment = CacheLineSize; #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) - TransformedFeatureType transformedFeaturesUnaligned - [FeatureTransformer::BufferSize - + alignment / sizeof(TransformedFeatureType)]; + TransformedFeatureType + transformedFeaturesUnaligned[FeatureTransformer::BufferSize + + alignment / sizeof(TransformedFeatureType)]; auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); #else - alignas(alignment) TransformedFeatureType transformedFeatures - [FeatureTransformer::BufferSize]; + alignas(alignment) TransformedFeatureType + transformedFeatures[FeatureTransformer::BufferSize]; #endif ASSERT_ALIGNED(transformedFeatures, alignment); @@ -284,7 +289,7 @@ NnueEvalTrace Network::trace_evaluate(const Position& pos) co for (IndexType bucket = 0; bucket < LayerStacks; ++bucket) { const auto materialist = - featureTransformer->transform(pos, transformedFeatures, bucket, false); + featureTransformer->transform(pos, cache, transformedFeatures, bucket, false); const auto positional = network[bucket]->propagate(transformedFeatures); t.psqt[bucket] = static_cast(materialist / OutputScale); diff --git a/src/nnue/network.h b/src/nnue/network.h index 21e1c622205..df59732d955 100644 --- a/src/nnue/network.h +++ b/src/nnue/network.h @@ -31,10 +31,10 @@ #include "nnue_architecture.h" #include "nnue_feature_transformer.h" #include "nnue_misc.h" +#include "nnue_accumulator.h" namespace Stockfish::Eval::NNUE { - enum class EmbeddedNNUEType { BIG, SMALL, @@ -43,6 +43,8 @@ enum class EmbeddedNNUEType { template class Network { + static constexpr IndexType FTDimensions = Arch::TransformedFeatureDimensions; + public: Network(EvalFile file, EmbeddedNNUEType type) : evalFile(file), @@ -51,17 +53,20 @@ class Network { void load(const std::string& rootDirectory, std::string evalfilePath); bool save(const std::optional& filename) const; + Value evaluate(const Position& pos, + AccumulatorCaches::Cache* cache, + bool adjusted = false, + int* complexity = nullptr, + bool psqtOnly = false) const; - Value evaluate(const Position& pos, - bool adjusted = false, - int* complexity = nullptr, - bool psqtOnly = false) const; - - void hint_common_access(const Position& pos, bool psqtOnl) const; + void hint_common_access(const Position& pos, + AccumulatorCaches::Cache* cache, + bool psqtOnl) const; void verify(std::string evalfilePath) const; - NnueEvalTrace trace_evaluate(const Position& pos) const; + NnueEvalTrace trace_evaluate(const Position& pos, + AccumulatorCaches::Cache* cache) const; private: void load_user_net(const std::string&, const std::string&); @@ -89,6 +94,9 @@ class Network { // Hash value of evaluation function structure static constexpr std::uint32_t hash = Transformer::get_hash_value() ^ Arch::get_hash_value(); + + template + friend struct AccumulatorCaches::Cache; }; // Definitions of the network types diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index c0746b4ee86..8d73dbef5ad 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -28,13 +28,75 @@ namespace Stockfish::Eval::NNUE { +using BiasType = std::int16_t; +using PSQTWeightType = std::int32_t; +using IndexType = std::uint32_t; + // Class that holds the result of affine transformation of input features template struct alignas(CacheLineSize) Accumulator { - std::int16_t accumulation[2][Size]; - std::int32_t psqtAccumulation[2][PSQTBuckets]; - bool computed[2]; - bool computedPSQT[2]; + std::int16_t accumulation[COLOR_NB][Size]; + std::int32_t psqtAccumulation[COLOR_NB][PSQTBuckets]; + bool computed[COLOR_NB]; + bool computedPSQT[COLOR_NB]; +}; + + +// AccumulatorCaches struct provides per-thread accumulator caches, where each +// cache contains multiple entries for each of the possible king squares. +// When the accumulator needs to be refreshed, the cached entry is used to more +// efficiently update the accumulator, instead of rebuilding it from scratch. +// This idea, was first described by Luecx (author of Koivisto) and +// is commonly referred to as "Finny Tables". +struct AccumulatorCaches { + + template + struct alignas(CacheLineSize) Cache { + + struct alignas(CacheLineSize) Entry { + BiasType accumulation[COLOR_NB][Size]; + PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets]; + Bitboard byColorBB[COLOR_NB][COLOR_NB]; + Bitboard byTypeBB[COLOR_NB][PIECE_TYPE_NB]; + + // To initialize a refresh entry, we set all its bitboards empty, + // so we put the biases in the accumulation, without any weights on top + void clear(const BiasType* biases) { + + std::memset(byColorBB, 0, sizeof(byColorBB)); + std::memset(byTypeBB, 0, sizeof(byTypeBB)); + + std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType)); + std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType)); + + std::memset(psqtAccumulation, 0, sizeof(psqtAccumulation)); + } + }; + + template + void clear(const Network& network) { + for (auto& entry : entries) + entry.clear(network.featureTransformer->biases); + } + + void clear(const BiasType* biases) { + for (auto& entry : entries) + entry.clear(biases); + } + + Entry& operator[](Square sq) { return entries[sq]; } + + std::array entries; + }; + + template + void clear(const Networks& networks) { + big.clear(networks.big); + } + + // When adding a new cache for a network, i.e. the smallnet + // the appropriate condition must be added to FeatureTransformer::update_accumulator_refresh. + Cache big; }; } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 0a0f4217fdb..88f0e4031a4 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -195,10 +195,10 @@ template StateInfo::*accPtr> class FeatureTransformer { - private: // Number of output dimensions for one side static constexpr IndexType HalfDimensions = TransformedFeatureDimensions; + private: #ifdef VECTOR static constexpr int NumRegs = BestRegisterCount(); @@ -306,10 +306,13 @@ class FeatureTransformer { } // Convert input features - std::int32_t - transform(const Position& pos, OutputType* output, int bucket, bool psqtOnly) const { - update_accumulator(pos, psqtOnly); - update_accumulator(pos, psqtOnly); + std::int32_t transform(const Position& pos, + AccumulatorCaches::Cache* cache, + OutputType* output, + int bucket, + bool psqtOnly) const { + update_accumulator(pos, cache, psqtOnly); + update_accumulator(pos, cache, psqtOnly); const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; const auto& psqtAccumulation = (pos.state()->*accPtr).psqtAccumulation; @@ -371,9 +374,11 @@ class FeatureTransformer { return psqt; } // end of function transform() - void hint_common_access(const Position& pos, bool psqtOnly) const { - hint_common_access_for_perspective(pos, psqtOnly); - hint_common_access_for_perspective(pos, psqtOnly); + void hint_common_access(const Position& pos, + AccumulatorCaches::Cache* cache, + bool psqtOnly) const { + hint_common_access_for_perspective(pos, cache, psqtOnly); + hint_common_access_for_perspective(pos, cache, psqtOnly); } private: @@ -650,7 +655,161 @@ class FeatureTransformer { } template - void update_accumulator_refresh(const Position& pos, bool psqtOnly) const { + void update_accumulator_refresh_cache(const Position& pos, + AccumulatorCaches::Cache* cache) const { + assert(cache != nullptr); + + Square ksq = pos.square(Perspective); + + auto& entry = (*cache)[ksq]; + + auto& accumulator = pos.state()->*accPtr; + accumulator.computed[Perspective] = true; + accumulator.computedPSQT[Perspective] = true; + + FeatureSet::IndexList removed, added; + for (Color c : {WHITE, BLACK}) + { + for (PieceType pt = PAWN; pt <= KING; ++pt) + { + const Piece piece = make_piece(c, pt); + const Bitboard oldBB = + entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt]; + const Bitboard newBB = pos.pieces(c, pt); + Bitboard toRemove = oldBB & ~newBB; + Bitboard toAdd = newBB & ~oldBB; + + while (toRemove) + { + Square sq = pop_lsb(toRemove); + removed.push_back(FeatureSet::make_index(sq, piece, ksq)); + } + while (toAdd) + { + Square sq = pop_lsb(toAdd); + added.push_back(FeatureSet::make_index(sq, piece, ksq)); + } + } + } + +#ifdef VECTOR + vec_t acc[NumRegs]; + psqt_vec_t psqt[NumPsqtRegs]; + + for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) + { + auto entryTile = + reinterpret_cast(&entry.accumulation[Perspective][j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = entryTile[k]; + + for (int i = 0; i < int(added.size()); ++i) + { + IndexType index = added[i]; + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + for (int i = 0; i < int(removed.size()); ++i) + { + IndexType index = removed[i]; + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_sub_16(acc[k], column[k]); + } + + for (IndexType k = 0; k < NumRegs; k++) + vec_store(&entryTile[k], acc[k]); + } + + for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) + { + auto entryTilePsqt = reinterpret_cast( + &entry.psqtAccumulation[Perspective][j * PsqtTileHeight]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = entryTilePsqt[k]; + + for (int i = 0; i < int(added.size()); ++i) + { + IndexType index = added[i]; + const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); + } + for (int i = 0; i < int(removed.size()); ++i) + { + IndexType index = removed[i]; + const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); + } + + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + vec_store_psqt(&entryTilePsqt[k], psqt[k]); + } + +#else + + for (const auto index : added) + { + const IndexType offset = HalfDimensions * index; + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[Perspective][j] += weights[offset + j]; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; + } + for (const auto index : removed) + { + const IndexType offset = HalfDimensions * index; + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[Perspective][j] -= weights[offset + j]; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; + } + +#endif + + // The accumulator of the refresh entry has been updated. + // Now copy its content to the actual accumulator we were refreshing + + std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective], + sizeof(int32_t) * PSQTBuckets); + + std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective], + sizeof(BiasType) * HalfDimensions); + + for (Color c : {WHITE, BLACK}) + entry.byColorBB[Perspective][c] = pos.pieces(c); + + for (PieceType pt = PAWN; pt <= KING; ++pt) + entry.byTypeBB[Perspective][pt] = pos.pieces(pt); + } + + template + void + update_accumulator_refresh(const Position& pos, + [[maybe_unused]] AccumulatorCaches::Cache* cache, + bool psqtOnly) const { + + // When we are refreshing the accumulator of the big net, + // redirect to the version of refresh that uses the refresh table. + // Using the cache for the small net is not beneficial. + if constexpr (HalfDimensions == Eval::NNUE::TransformedFeatureDimensionsBig) + { + update_accumulator_refresh_cache(pos, cache); + return; + } + #ifdef VECTOR // Gcc-10.2 unnecessarily spills AVX2 registers if this array // is defined in the VECTOR code below, once in each branch @@ -764,7 +923,9 @@ class FeatureTransformer { } template - void hint_common_access_for_perspective(const Position& pos, bool psqtOnly) const { + void hint_common_access_for_perspective(const Position& pos, + AccumulatorCaches::Cache* cache, + bool psqtOnly) const { // Works like update_accumulator, but performs less work. // Updates ONLY the accumulator for pos. @@ -787,11 +948,13 @@ class FeatureTransformer { psqtOnly); } else - update_accumulator_refresh(pos, psqtOnly); + update_accumulator_refresh(pos, cache, psqtOnly); } template - void update_accumulator(const Position& pos, bool psqtOnly) const { + void update_accumulator(const Position& pos, + AccumulatorCaches::Cache* cache, + bool psqtOnly) const { auto [oldest_st, next] = try_find_computed_accumulator(pos, psqtOnly); @@ -813,9 +976,12 @@ class FeatureTransformer { psqtOnly); } else - update_accumulator_refresh(pos, psqtOnly); + update_accumulator_refresh(pos, cache, psqtOnly); } + template + friend struct AccumulatorCaches::Cache; + alignas(CacheLineSize) BiasType biases[HalfDimensions]; alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions]; alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets]; diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp index 3fa6e1b6180..51838fefa44 100644 --- a/src/nnue/nnue_misc.cpp +++ b/src/nnue/nnue_misc.cpp @@ -42,13 +42,15 @@ namespace Stockfish::Eval::NNUE { constexpr std::string_view PieceToChar(" PNBRQK pnbrqk"); -void hint_common_parent_position(const Position& pos, const Networks& networks) { +void hint_common_parent_position(const Position& pos, + const Networks& networks, + AccumulatorCaches& caches) { int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move())); if (simpleEvalAbs > Eval::SmallNetThreshold) - networks.small.hint_common_access(pos, simpleEvalAbs > Eval::PsqtOnlyThreshold); + networks.small.hint_common_access(pos, nullptr, simpleEvalAbs > Eval::PsqtOnlyThreshold); else - networks.big.hint_common_access(pos, false); + networks.big.hint_common_access(pos, &caches.big, false); } namespace { @@ -104,7 +106,8 @@ void format_cp_aligned_dot(Value v, std::stringstream& stream, const Position& p // Returns a string with the value of each piece on a board, // and a table for (PSQT, Layers) values bucket by bucket. -std::string trace(Position& pos, const Eval::NNUE::Networks& networks) { +std::string +trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::AccumulatorCaches& caches) { std::stringstream ss; @@ -130,7 +133,7 @@ std::string trace(Position& pos, const Eval::NNUE::Networks& networks) { // We estimate the value of each piece by doing a differential evaluation from // the current base eval, simulating the removal of the piece from its square. - Value base = networks.big.evaluate(pos); + Value base = networks.big.evaluate(pos, &caches.big); base = pos.side_to_move() == WHITE ? base : -base; for (File f = FILE_A; f <= FILE_H; ++f) @@ -149,7 +152,7 @@ std::string trace(Position& pos, const Eval::NNUE::Networks& networks) { st->accumulatorBig.computedPSQT[WHITE] = st->accumulatorBig.computedPSQT[BLACK] = false; - Value eval = networks.big.evaluate(pos); + Value eval = networks.big.evaluate(pos, &caches.big); eval = pos.side_to_move() == WHITE ? eval : -eval; v = base - eval; @@ -167,7 +170,7 @@ std::string trace(Position& pos, const Eval::NNUE::Networks& networks) { ss << board[row] << '\n'; ss << '\n'; - auto t = networks.big.trace_evaluate(pos); + auto t = networks.big.trace_evaluate(pos, &caches.big); ss << " NNUE network contributions " << (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << std::endl diff --git a/src/nnue/nnue_misc.h b/src/nnue/nnue_misc.h index 5eab02184c6..27a93f88435 100644 --- a/src/nnue/nnue_misc.h +++ b/src/nnue/nnue_misc.h @@ -50,12 +50,13 @@ struct NnueEvalTrace { std::size_t correctBucket; }; - struct Networks; +struct AccumulatorCaches; - -std::string trace(Position& pos, const Networks& networks); -void hint_common_parent_position(const Position& pos, const Networks& networks); +std::string trace(Position& pos, const Networks& networks, AccumulatorCaches& caches); +void hint_common_parent_position(const Position& pos, + const Networks& networks, + AccumulatorCaches& caches); } // namespace Stockfish::Eval::NNUE } // namespace Stockfish diff --git a/src/search.cpp b/src/search.cpp index 6813c1a5f61..7ff94128eb1 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -33,6 +33,8 @@ #include "misc.h" #include "movegen.h" #include "movepick.h" +#include "nnue/network.h" +#include "nnue/nnue_accumulator.h" #include "nnue/nnue_common.h" #include "nnue/nnue_misc.h" #include "position.h" @@ -135,6 +137,7 @@ Search::Worker::Worker(SharedState& sharedState, // Unpack the SharedState struct into member variables thread_idx(thread_id), manager(std::move(sm)), + refreshTable(), options(sharedState.options), threads(sharedState.threads), tt(sharedState.tt), @@ -143,6 +146,10 @@ Search::Worker::Worker(SharedState& sharedState, } void Search::Worker::start_searching() { + + // Initialize accumulator refresh entries + refreshTable.clear(networks); + // Non-main threads go directly to iterative_deepening() if (!is_mainthread()) { @@ -564,7 +571,7 @@ Value Search::Worker::search( if (threads.stop.load(std::memory_order_relaxed) || pos.is_draw(ss->ply) || ss->ply >= MAX_PLY) return (ss->ply >= MAX_PLY && !ss->inCheck) - ? evaluate(networks, pos, thisThread->optimism[us]) + ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) : value_draw(thisThread->nodes); // Step 3. Mate distance pruning. Even if we mate at the next move our score @@ -698,7 +705,7 @@ Value Search::Worker::search( { // Providing the hint that this node's accumulator will be used often // brings significant Elo gain (~13 Elo). - Eval::NNUE::hint_common_parent_position(pos, networks); + Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); unadjustedStaticEval = eval = ss->staticEval; } else if (ss->ttHit) @@ -706,9 +713,9 @@ Value Search::Worker::search( // Never assume anything about values stored in TT unadjustedStaticEval = tte->eval(); if (unadjustedStaticEval == VALUE_NONE) - unadjustedStaticEval = evaluate(networks, pos, thisThread->optimism[us]); + unadjustedStaticEval = evaluate(networks, pos, refreshTable, thisThread->optimism[us]); else if (PvNode) - Eval::NNUE::hint_common_parent_position(pos, networks); + Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); @@ -718,7 +725,7 @@ Value Search::Worker::search( } else { - unadjustedStaticEval = evaluate(networks, pos, thisThread->optimism[us]); + unadjustedStaticEval = evaluate(networks, pos, refreshTable, thisThread->optimism[us]); ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); // Static evaluation is saved as it was before adjustment by correction history @@ -875,7 +882,7 @@ Value Search::Worker::search( } } - Eval::NNUE::hint_common_parent_position(pos, networks); + Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); } moves_loop: // When in check, search starts here @@ -1413,7 +1420,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, // Step 2. Check for an immediate draw or maximum ply reached if (pos.is_draw(ss->ply) || ss->ply >= MAX_PLY) return (ss->ply >= MAX_PLY && !ss->inCheck) - ? evaluate(networks, pos, thisThread->optimism[us]) + ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) : VALUE_DRAW; assert(0 <= ss->ply && ss->ply < MAX_PLY); @@ -1445,7 +1452,8 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, // Never assume anything about values stored in TT unadjustedStaticEval = tte->eval(); if (unadjustedStaticEval == VALUE_NONE) - unadjustedStaticEval = evaluate(networks, pos, thisThread->optimism[us]); + unadjustedStaticEval = + evaluate(networks, pos, refreshTable, thisThread->optimism[us]); ss->staticEval = bestValue = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); @@ -1458,7 +1466,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, { // In case of null move search, use previous static eval with a different sign unadjustedStaticEval = (ss - 1)->currentMove != Move::null() - ? evaluate(networks, pos, thisThread->optimism[us]) + ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) : -(ss - 1)->staticEval; ss->staticEval = bestValue = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); diff --git a/src/search.h b/src/search.h index 3ceaf5ddd0b..0fd778b47e6 100644 --- a/src/search.h +++ b/src/search.h @@ -26,9 +26,9 @@ #include #include #include +#include #include #include -#include #include "misc.h" #include "movepick.h" @@ -37,6 +37,7 @@ #include "syzygy/tbprobe.h" #include "timeman.h" #include "types.h" +#include "nnue/nnue_accumulator.h" namespace Stockfish { @@ -301,6 +302,10 @@ class Worker { Tablebases::Config tbConfig; + // Used by NNUE + + Eval::NNUE::AccumulatorCaches refreshTable; + const OptionsMap& options; ThreadPool& threads; TranspositionTable& tt;