|
io-chess
UCI chess engine
|
Factorized Mixture of Experts (MoE) neural network architecture and inference components. More...
#include <algorithm>#include <array>#include <atomic>#include <cassert>#include <chrono>#include <cmath>#include <condition_variable>#include <cstdint>#include <cstdlib>#include <cstring>#include <functional>#include <iomanip>#include <iostream>#include <limits>#include <memory>#include <mutex>#include <queue>#include <random>#include <string>#include <thread>#include <type_traits>#include <utility>#include <vector>#include <chess.hpp>#include "FactorizedFeatureExtractor.hpp"#include "PersistentThreadPool.hpp"

Go to the source code of this file.
Classes | |
| struct | BenchConfig |
| struct | BenchResult |
| struct | BranchLayer |
| struct | Branch |
| struct | Expert |
| struct | SharedMoEWeights |
| Contains the globally shared, read-only weights for the Factorized MoE network. More... | |
| struct | MoEDoubleAccumulator |
| Thread-local state for incremental, lightning-fast neural network inference. More... | |
| struct | MoEDoubleAccumulator::PhaseProfile |
Macros | |
| #define | FORCE_VECTORIZE |
| #define | HOT_RESTRICT __restrict__ |
Typedefs | |
| using | Clock = std::chrono::high_resolution_clock |
Enumerations | |
| enum class | ExpertPoolMode { Flat , Gap , Pool2x2Avg , Pool2x2Max } |
| Defines how spatial features are pooled before entering the fully-connected expert networks. More... | |
Functions | |
| static double | us (Clock::time_point a, Clock::time_point b) |
| static bool | is_slider_branch_rich (int branch_idx) |
| Determines if a specific branch corresponds to a slider piece type (Bishop, Rook, Queen). | |
| static int | active_channels_for_branch (int branch_idx) |
| Returns the number of active input feature channels for a given branch. | |
| static bool | is_slider_dirty (const Board &board, const Board &old_board, Color side, PieceType pt) |
| Checks if a slider piece type (Bishop, Rook, Queen) has become "dirty". | |
| static void | mark_head (std::array< uint8_t, 12 > &m, Color s, PieceType pt) |
| Marks a specific piece type for a specific color as "dirty" in the dirty mask. | |
| static std::array< uint8_t, 12 > | build_dirty_mask (const Board &old_board, const Board &new_board, const Move &mv) |
| Analyzes a chess move to determine exactly which of the 12 spatial branches need recomputing. | |
| static bool | branch_planes_changed (const FactorizedInput &a, const FactorizedInput &b, int branch_idx) |
| Performs a fast byte-level comparison to see if two spatial branch inputs are identical. | |
| static void | simd_add_scaled (float *HOT_RESTRICT dst, const float *HOT_RESTRICT src, float scale, int n) |
| SIMD-optimized fused multiply-add (FMA) for adding a scaled vector to a destination vector. dst[i] += src[i] * scale. | |
| template<typename T> | |
| static T * | assume_aligned_32 (T *ptr) |
| Hints the compiler that a pointer is aligned to a 32-byte boundary (for AVX operations). | |
| template<typename T> | |
| static const T * | assume_aligned_32 (const T *ptr) |
| static void | conv3x3_accumulate_plane (float *HOT_RESTRICT out_plane, const float *HOT_RESTRICT in_plane, const float *HOT_RESTRICT wk) |
| Computes a full 3x3 2D convolution for a single input channel and accumulates it into out_plane. | |
| static void | conv3x3_single_out_relu (const float *HOT_RESTRICT in, const float *HOT_RESTRICT w, float b, float *HOT_RESTRICT out, int ic) |
| Computes a 3x3 convolution across multiple input channels (ic) to produce a single output channel, followed by ReLU. | |
| static void | conv3x3_relu (const float *HOT_RESTRICT in, const float *HOT_RESTRICT w, const float *HOT_RESTRICT b, float *HOT_RESTRICT out, int ic, int oc) |
| Computes a full 3x3 convolution from ic input channels to oc output channels, followed by ReLU. | |
| static void | conv3x3_relu_bd16_dispatch (const float *HOT_RESTRICT in, const float *HOT_RESTRICT w, const float *HOT_RESTRICT b, float *HOT_RESTRICT out, int ic) |
| static void | depthwise_conv3x3_relu (const float *HOT_RESTRICT in, const float *HOT_RESTRICT w, const float *HOT_RESTRICT b, float *HOT_RESTRICT out, int channels) |
| Computes a depthwise 3x3 convolution followed by a ReLU activation. | |
| static void | conv1x1_relu (const float *HOT_RESTRICT in, const float *HOT_RESTRICT w, const float *HOT_RESTRICT b, float *HOT_RESTRICT out, int ic, int oc) |
| Computes a 1x1 convolution (pointwise convolution) across the spatial grid, followed by ReLU. | |
| static const char * | expert_pool_mode_name (ExpertPoolMode mode) |
| static int | pool2x2_region_from_sq (int sq) |
| static void | pool2x2_region_base (int region, int &r0, int &c0) |
Variables | |
| static constexpr int | kDefaultBranchDim = 16 |
| static constexpr int | kDefaultMixerOut = 64 |
| static constexpr int | kDefaultBypass = 12 |
| static constexpr int | kDefaultGlobals = 21 |
| static constexpr int | kDefaultExperts = 4 |
| static constexpr int | kDefaultExpertBottleneck = 32 |
| static constexpr int | kDefaultExpertHidden = 128 |
| static constexpr int | NET_BRANCH_DIM = 16 |
| static constexpr int | NET_MIXER_OUT = 64 |
| static constexpr int | NET_BYPASS = 12 |
| static constexpr int | NET_GLOBALS = 21 |
| static constexpr int | NET_EXPERTS = 4 |
| static constexpr int | NET_EXPERT_BOTTLENECK = 32 |
| static constexpr int | NET_EXPERT_HIDDEN = 128 |
| static constexpr int | kInputBypassPlanes = 12 |
| static constexpr int | kInputGlobals = 32 |
| static constexpr int | kMaxBranchDim = NET_BRANCH_DIM |
| static constexpr int | kMaxMixerOut = NET_MIXER_OUT |
| static constexpr int | kMaxBypass = NET_BYPASS |
| static constexpr int | kMaxGlobals = NET_GLOBALS |
| static constexpr int | kMaxExperts = NET_EXPERTS |
| static constexpr int | kMaxExpertBottleneck = NET_EXPERT_BOTTLENECK |
| static constexpr int | kMaxExpertHidden = NET_EXPERT_HIDDEN |
| static constexpr int | kMixerOcTile = 8 |
| static constexpr int | kPool2x2Regions = 16 |
Factorized Mixture of Experts (MoE) neural network architecture and inference components.
This file implements the core inference structures for the natively-compiled Factorized MoE network used for position evaluation. The architecture processes a custom 12-branch spatial feature set extracted via FactorizedFeatureExtractor, applies a mixer layer, routes the evaluation to specialized expert networks, and computes the final Win/Draw/Loss probabilities.
For maximum performance, this file uses a double accumulator pattern (MoEDoubleAccumulator):
| #define FORCE_VECTORIZE |
| #define HOT_RESTRICT __restrict__ |
| using Clock = std::chrono::high_resolution_clock |
|
strong |
|
inlinestatic |
Returns the number of active input feature channels for a given branch.
| branch_idx | The index of the branch. |


|
inlinestatic |
|
inlinestatic |
Hints the compiler that a pointer is aligned to a 32-byte boundary (for AVX operations).

|
inlinestatic |
Performs a fast byte-level comparison to see if two spatial branch inputs are identical.
| a | The old factorized input. |
| b | The new factorized input. |
| branch_idx | The index of the branch to check. |


|
static |
Analyzes a chess move to determine exactly which of the 12 spatial branches need recomputing.
This is the core of the incremental update logic. It prevents full network re-evaluations by figuring out which specific piece types (e.g., White Pawns, Black Knights) had their board representation changed by the move, including complex cases like discoveries, castling, and en passant.
| old_board | The board state before the move. |
| new_board | The board state after the move. |
| mv | The move that was just played. |

|
static |
Computes a 1x1 convolution (pointwise convolution) across the spatial grid, followed by ReLU.
This operation mixes features across channels for every spatial square independently.
| in | Flattened input tensor [ic][64]. |
| w | Flattened weights [oc][ic]. |
| b | Flattened biases [oc]. |
| out | Flattened output tensor [oc][64]. |
| ic | Number of input channels. |
| oc | Number of output channels. |


|
inlinestatic |
Computes a full 3x3 2D convolution for a single input channel and accumulates it into out_plane.
This avoids boundary checks by breaking the 8x8 chessboard into regions (Center, North/South edges, Corners).
| out_plane | The destination 64-element array. |
| in_plane | The source 64-element array representing the spatial features. |
| wk | A 9-element array representing the 3x3 convolution weights. |

|
static |
Computes a full 3x3 convolution from ic input channels to oc output channels, followed by ReLU.
| in | Flattened input tensor [ic][64]. |
| w | Flattened weights [oc][ic][9]. |
| b | Flattened biases [oc]. |
| out | Flattened output tensor [oc][64]. |
| ic | Number of input channels. |
| oc | Number of output channels. |


|
inlinestatic |


|
inlinestatic |
Computes a 3x3 convolution across multiple input channels (ic) to produce a single output channel, followed by ReLU.
| in | The flattened input tensor [ic][64]. |
| w | The flattened weights [ic][9]. |
| b | The bias scalar. |
| out | The destination 64-element array for the output channel. |
| ic | Number of input channels. |


|
inlinestatic |
Computes a depthwise 3x3 convolution followed by a ReLU activation.
In a depthwise convolution, each input channel is convolved with its own set of spatial weights, producing exactly one output channel per input channel, without mixing information across channels.
| in | Flattened input tensor [channels][64]. |
| w | Flattened depthwise weights [channels][9]. |
| b | Flattened biases [channels]. |
| out | Flattened output tensor [channels][64]. |
| channels | Number of channels (both input and output). |


|
inlinestatic |
|
inlinestatic |
Determines if a specific branch corresponds to a slider piece type (Bishop, Rook, Queen).
| branch_idx | The index of the branch (0-11). |

|
static |
Checks if a slider piece type (Bishop, Rook, Queen) has become "dirty".
A slider is dirty if any piece of that type moved, or if the board occupancy changed in a way that intersects with any of the slider's rays (blocking or unblocking its attack path).
| board | The new board state. |
| old_board | The previous board state. |
| side | The color of the slider pieces to check. |
| pt | The piece type (BISHOP, ROOK, or QUEEN). |

Marks a specific piece type for a specific color as "dirty" in the dirty mask.
| m | The 12-element boolean array mapping to the 12 spatial branches. |
| s | The color of the piece. |
| pt | The type of the piece. |

|
inlinestatic |

|
inlinestatic |

|
inlinestatic |
SIMD-optimized fused multiply-add (FMA) for adding a scaled vector to a destination vector. dst[i] += src[i] * scale.
| dst | The accumulator array. |
| src | The source array to scale and add. |
| scale | The scalar multiplier. |
| n | Number of elements. |

|
inlinestatic |

|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |
|
staticconstexpr |