io-chess
UCI chess engine
Loading...
Searching...
No Matches
MoEDoubleAccumulator Struct Reference

Thread-local state for incremental, lightning-fast neural network inference. More...

#include <MoECacheModel.hpp>

Collaboration diagram for MoEDoubleAccumulator:

Classes

struct  PhaseProfile

Public Member Functions

const SharedMoEWeights & shared_weights () const
SharedMoEWeights & mutable_owned_weights ()
void reset_runtime_state ()
void copy_weights_from (const MoEDoubleAccumulator &src)
void reset_profile ()
template<typename Fn>
void parallel_for_indices (int n, int min_parallel_n, Fn &&fn)
template<typename Fn>
void parallel_for_indices (int n, Fn &&fn)
long long total_weights () const
long long single_expert_weights () const
long long experts_total_weights () const
long long backbone_weights () const
long long runtime_topk_weights (int topk) const
void init (const SharedMoEWeights *shared, const BenchConfig &cfg)
void init (const BenchConfig &cfg)
void fill_random (unsigned seed)
void branch_forward_bd16_fast (const Branch &br, const float *HOT_RESTRICT in_planes, float *HOT_RESTRICT out, float *HOT_RESTRICT mid_plane, float *HOT_RESTRICT l1_accum)
void branch_forward_with_scratch (int b, const float *in_planes, float *out, float *scratch0, float *scratch1)
void branch_forward (int b, const float *in_planes, float *out)
void rebuild_hidden_acc_from_flat (int e)
void rebuild_hidden_acc_from_gap (int e)
void rebuild_hidden_acc_from_pool2x2 (int e, bool max_pool)
float global_proj_at (int oc, const float *g) const
void top2_experts (const float *global, int &e0, int &e1, float &w0, float &w1) const
void rebuild_expert_cache_from_mixer (int e)
void full_rebuild_accumulators (const FactorizedInput &inp, const int *active_experts, int active_count)
 Performs a full forward pass of the model, discarding all cache.
void update_incremental (const FactorizedInput &cur, const FactorizedInput &prev, const int *dirty_branches, int dirty_count, const int *active_experts, int active_count)
 Performs an incremental network update by computing and applying only the differences.
void run_active_expert (int e, float out_wdl[3])
 Computes the final hidden layer and WDL output for a single expert.
void run_top2_experts (int e0, int e1, float w0, float w1, float out_wdl[3])
 Combines the output of the top 2 routed experts based on their routing weights.

Static Public Member Functions

static void validate_fixed_architecture (const BenchConfig &cfg)

Public Attributes

int branchConvLayers = 3
int nThreads = 1
int minParallelDirtyHeads = 4
int minParallelActiveExperts = 3
int denseDirtySqThreshold = 16
ExpertPoolMode expertPoolMode = ExpertPoolMode::Pool2x2Avg
bool routeSlowGlobals = false
const SharedMoEWeights * weights = nullptr
std::shared_ptr< SharedMoEWeights > ownedWeights {}
std::unique_ptr< PersistentThreadPool > threadPool
std::array< float,(size_t) 12 *kMaxBranchDim *64 > branchCache {}
std::array< float,(size_t) kMaxMixerOut *64 > mixerLinearAccum {}
std::array< float,(size_t) kMaxMixerOut *64 > mixerReluCache {}
std::array< std::array< float,(size_t) kMaxExpertBottleneck *64 >, kMaxExperts > exPreAccum {}
std::array< std::array< float,(size_t) kMaxExpertBottleneck *64 >, kMaxExperts > exReluCache {}
std::array< uint8_t, kMaxExperts > exValid {}
std::array< std::array< float, kMaxExpertHidden >, kMaxExperts > hiddenAcc {}
std::array< std::array< float, kMaxExpertBottleneck >, kMaxExperts > exGapCache {}
std::array< std::array< float,(size_t) kMaxExpertBottleneck *kPool2x2Regions >, kMaxExperts > exPool16Cache {}
std::array< float, kMaxGlobals > oldGlobalV {}
std::array< float,(size_t) kMaxBranchDim *64 > scratchT0 {}
std::array< float,(size_t) kMaxBranchDim *64 > scratchT1 {}
std::array< float,(size_t) kMaxBranchDim *64 > scratchNewBranch {}
std::array< std::array< float,(size_t) kMaxBranchDim *64 >, 12 > scratchParallelBranch0 {}
std::array< std::array< float,(size_t) kMaxBranchDim *64 >, 12 > scratchParallelBranch1 {}
std::array< float,(size_t) 12 *kMaxBranchDim *64 > scratchDirtyBranches {}
std::array< float,(size_t) kMaxBranchDim *64 > scratchBranchDelta {}
std::array< float,(size_t) kMaxBypass *64 > scratchBypassDelta {}
std::array< float, kMaxMixerOut > scratchGproj {}
std::array< float,(size_t) kMaxMixerOut *64 > scratchDeltaRelu {}
std::array< float,(size_t) kMaxExpertBottleneck *64 > scratchFlatDelta {}
std::array< std::array< float,(size_t) kMaxExpertBottleneck *64 >, kMaxExperts > scratchParallelExpertDelta {}
std::array< float, kMaxExpertHidden > scratchHidden {}
PhaseProfile profile {}
bool initialized = false

Static Public Attributes

static constexpr int bd = NET_BRANCH_DIM
static constexpr int nf = NET_MIXER_OUT
static constexpr int nBypass = NET_BYPASS
static constexpr int nGlobals = NET_GLOBALS
static constexpr int nExperts = NET_EXPERTS
static constexpr int ebo = NET_EXPERT_BOTTLENECK
static constexpr int eh = NET_EXPERT_HIDDEN

Detailed Description

Thread-local state for incremental, lightning-fast neural network inference.

The double accumulator pattern avoids running the full neural network on every position. Instead, it maintains a base state (base_ variables) corresponding to the parent position. When a move is played, it identifies which feature branches changed (e.g. only a Knight moved) and calculates the difference (dirty_branches). It then propagates only these differences through the mixer and into the active expert networks.

The update_incremental() function achieves <1 μs latency by leveraging this delta propagation.

Member Function Documentation

◆ backbone_weights()

long long MoEDoubleAccumulator::backbone_weights ( ) const
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ branch_forward()

void MoEDoubleAccumulator::branch_forward ( int b,
const float * in_planes,
float * out )
inline
Here is the call graph for this function:

◆ branch_forward_bd16_fast()

void MoEDoubleAccumulator::branch_forward_bd16_fast ( const Branch & br,
const float *HOT_RESTRICT in_planes,
float *HOT_RESTRICT out,
float *HOT_RESTRICT mid_plane,
float *HOT_RESTRICT l1_accum )
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ branch_forward_with_scratch()

void MoEDoubleAccumulator::branch_forward_with_scratch ( int b,
const float * in_planes,
float * out,
float * scratch0,
float * scratch1 )
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ copy_weights_from()

void MoEDoubleAccumulator::copy_weights_from ( const MoEDoubleAccumulator & src)
inline
Here is the call graph for this function:

◆ experts_total_weights()

long long MoEDoubleAccumulator::experts_total_weights ( ) const
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ fill_random()

void MoEDoubleAccumulator::fill_random ( unsigned seed)
inline
Here is the call graph for this function:

◆ full_rebuild_accumulators()

void MoEDoubleAccumulator::full_rebuild_accumulators ( const FactorizedInput & inp,
const int * active_experts,
int active_count )
inline

Performs a full forward pass of the model, discarding all cache.

This is used for the very first evaluation of a game (where there is no previous state to incrementally update from), or when the position changes so drastically (e.g. >6 dirty branches) that an incremental update would be slower than a full pass.

Parameters
inp: The full spatial features of the current position.
active_experts: Array containing the indices of the experts chosen by the router.
active_count: The number of active experts (usually 2).
Here is the call graph for this function:
Here is the caller graph for this function:

◆ global_proj_at()

float MoEDoubleAccumulator::global_proj_at ( int oc,
const float * g ) const
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ init() [1/2]

void MoEDoubleAccumulator::init ( const BenchConfig & cfg)
inline
Here is the call graph for this function:

◆ init() [2/2]

void MoEDoubleAccumulator::init ( const SharedMoEWeights * shared,
const BenchConfig & cfg )
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ mutable_owned_weights()

SharedMoEWeights & MoEDoubleAccumulator::mutable_owned_weights ( )
inline
Here is the caller graph for this function:

◆ parallel_for_indices() [1/2]

template<typename Fn>
void MoEDoubleAccumulator::parallel_for_indices ( int n,
Fn && fn )
inline
Here is the call graph for this function:

◆ parallel_for_indices() [2/2]

template<typename Fn>
void MoEDoubleAccumulator::parallel_for_indices ( int n,
int min_parallel_n,
Fn && fn )
inline
Here is the caller graph for this function:

◆ rebuild_expert_cache_from_mixer()

void MoEDoubleAccumulator::rebuild_expert_cache_from_mixer ( int e)
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ rebuild_hidden_acc_from_flat()

void MoEDoubleAccumulator::rebuild_hidden_acc_from_flat ( int e)
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ rebuild_hidden_acc_from_gap()

void MoEDoubleAccumulator::rebuild_hidden_acc_from_gap ( int e)
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ rebuild_hidden_acc_from_pool2x2()

void MoEDoubleAccumulator::rebuild_hidden_acc_from_pool2x2 ( int e,
bool max_pool )
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ reset_profile()

void MoEDoubleAccumulator::reset_profile ( )
inline

◆ reset_runtime_state()

void MoEDoubleAccumulator::reset_runtime_state ( )
inline
Here is the caller graph for this function:

◆ run_active_expert()

void MoEDoubleAccumulator::run_active_expert ( int e,
float out_wdl[3] )
inline

Computes the final hidden layer and WDL output for a single expert.

Takes the accumulated bottleneck state for the given expert, passes it through the expert's hidden layer (with ReLU), and multiplies by the final Win/Draw/Loss weights.

Parameters
e: The index of the expert to run.
out_wdl: A 3-element float array where the un-normalized WDL logits will be stored.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ run_top2_experts()

void MoEDoubleAccumulator::run_top2_experts ( int e0,
int e1,
float w0,
float w1,
float out_wdl[3] )
inline

Combines the output of the top 2 routed experts based on their routing weights.

Evaluates both experts independently and takes a weighted average of their Win/Draw/Loss logits using the probabilities assigned by the Router gate.

Parameters
e0: Index of the best expert.
e1: Index of the second-best expert.
w0: Routing weight (probability) for the best expert.
w1: Routing weight (probability) for the second-best expert.
out_wdl: Array to store the combined WDL logits.
Here is the call graph for this function:

◆ runtime_topk_weights()

long long MoEDoubleAccumulator::runtime_topk_weights ( int topk) const
inline
Here is the call graph for this function:

◆ shared_weights()

const SharedMoEWeights & MoEDoubleAccumulator::shared_weights ( ) const
inline
Here is the caller graph for this function:

◆ single_expert_weights()

long long MoEDoubleAccumulator::single_expert_weights ( ) const
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ top2_experts()

void MoEDoubleAccumulator::top2_experts ( const float * global,
int & e0,
int & e1,
float & w0,
float & w1 ) const
inline
Here is the call graph for this function:

◆ total_weights()

long long MoEDoubleAccumulator::total_weights ( ) const
inline
Here is the call graph for this function:
Here is the caller graph for this function:

◆ update_incremental()

void MoEDoubleAccumulator::update_incremental ( const FactorizedInput & cur,
const FactorizedInput & prev,
const int * dirty_branches,
int dirty_count,
const int * active_experts,
int active_count )
inline

Performs an incremental network update by computing and applying only the differences.

This is the heart of the engine's speed. By passing a list of dirty_branches, this function skips computing convolutions for pieces that haven't moved. It calculates the delta for the branches that did change, propagates that delta through the mixer layer, and updates the bottlenecks of the active experts.

Parameters
cur: The features for the new position.
prev: The features for the old (parent) position.
dirty_branches: Array of branch indices that changed.
dirty_count: Number of branches that changed.
active_experts: Array containing the indices of the currently active experts.
active_count: Number of active experts.
Here is the call graph for this function:

◆ validate_fixed_architecture()

void MoEDoubleAccumulator::validate_fixed_architecture ( const BenchConfig & cfg)
inline static
Here is the caller graph for this function:

Member Data Documentation

◆ bd

int MoEDoubleAccumulator::bd = NET_BRANCH_DIM
static constexpr

◆ branchCache

std::array<float, (size_t)12 * kMaxBranchDim * 64> MoEDoubleAccumulator::branchCache {}

◆ branchConvLayers

int MoEDoubleAccumulator::branchConvLayers = 3

◆ denseDirtySqThreshold

int MoEDoubleAccumulator::denseDirtySqThreshold = 16

◆ ebo

int MoEDoubleAccumulator::ebo = NET_EXPERT_BOTTLENECK
static constexpr

◆ eh

int MoEDoubleAccumulator::eh = NET_EXPERT_HIDDEN
static constexpr

◆ exGapCache

std::array<std::array<float, kMaxExpertBottleneck>, kMaxExperts> MoEDoubleAccumulator::exGapCache {}

◆ expertPoolMode

ExpertPoolMode MoEDoubleAccumulator::expertPoolMode = ExpertPoolMode::Pool2x2Avg

◆ exPool16Cache

std::array< std::array<float, (size_t)kMaxExpertBottleneck * kPool2x2Regions>, kMaxExperts> MoEDoubleAccumulator::exPool16Cache {}

◆ exPreAccum

std::array<std::array<float, (size_t)kMaxExpertBottleneck * 64>, kMaxExperts> MoEDoubleAccumulator::exPreAccum {}

◆ exReluCache

std::array<std::array<float, (size_t)kMaxExpertBottleneck * 64>, kMaxExperts> MoEDoubleAccumulator::exReluCache {}

◆ exValid

std::array<uint8_t, kMaxExperts> MoEDoubleAccumulator::exValid {}

◆ hiddenAcc

std::array<std::array<float, kMaxExpertHidden>, kMaxExperts> MoEDoubleAccumulator::hiddenAcc {}

◆ initialized

bool MoEDoubleAccumulator::initialized = false

◆ minParallelActiveExperts

int MoEDoubleAccumulator::minParallelActiveExperts = 3

◆ minParallelDirtyHeads

int MoEDoubleAccumulator::minParallelDirtyHeads = 4

◆ mixerLinearAccum

std::array<float, (size_t)kMaxMixerOut * 64> MoEDoubleAccumulator::mixerLinearAccum {}

◆ mixerReluCache

std::array<float, (size_t)kMaxMixerOut * 64> MoEDoubleAccumulator::mixerReluCache {}

◆ nBypass

int MoEDoubleAccumulator::nBypass = NET_BYPASS
static constexpr

◆ nExperts

int MoEDoubleAccumulator::nExperts = NET_EXPERTS
static constexpr

◆ nf

int MoEDoubleAccumulator::nf = NET_MIXER_OUT
static constexpr

◆ nGlobals

int MoEDoubleAccumulator::nGlobals = NET_GLOBALS
static constexpr

◆ nThreads

int MoEDoubleAccumulator::nThreads = 1

◆ oldGlobalV

std::array<float, kMaxGlobals> MoEDoubleAccumulator::oldGlobalV {}

◆ ownedWeights

std::shared_ptr<SharedMoEWeights> MoEDoubleAccumulator::ownedWeights {}

◆ profile

PhaseProfile MoEDoubleAccumulator::profile {}

◆ routeSlowGlobals

bool MoEDoubleAccumulator::routeSlowGlobals = false

◆ scratchBranchDelta

std::array<float, (size_t)kMaxBranchDim * 64> MoEDoubleAccumulator::scratchBranchDelta {}

◆ scratchBypassDelta

std::array<float, (size_t)kMaxBypass * 64> MoEDoubleAccumulator::scratchBypassDelta {}

◆ scratchDeltaRelu

std::array<float, (size_t)kMaxMixerOut * 64> MoEDoubleAccumulator::scratchDeltaRelu {}

◆ scratchDirtyBranches

std::array<float, (size_t)12 * kMaxBranchDim * 64> MoEDoubleAccumulator::scratchDirtyBranches {}

◆ scratchFlatDelta

std::array<float, (size_t)kMaxExpertBottleneck * 64> MoEDoubleAccumulator::scratchFlatDelta {}

◆ scratchGproj

std::array<float, kMaxMixerOut> MoEDoubleAccumulator::scratchGproj {}

◆ scratchHidden

std::array<float, kMaxExpertHidden> MoEDoubleAccumulator::scratchHidden {}

◆ scratchNewBranch

std::array<float, (size_t)kMaxBranchDim * 64> MoEDoubleAccumulator::scratchNewBranch {}

◆ scratchParallelBranch0

std::array<std::array<float, (size_t)kMaxBranchDim * 64>, 12> MoEDoubleAccumulator::scratchParallelBranch0 {}

◆ scratchParallelBranch1

std::array<std::array<float, (size_t)kMaxBranchDim * 64>, 12> MoEDoubleAccumulator::scratchParallelBranch1 {}

◆ scratchParallelExpertDelta

std::array<std::array<float, (size_t)kMaxExpertBottleneck * 64>, kMaxExperts> MoEDoubleAccumulator::scratchParallelExpertDelta {}

◆ scratchT0

std::array<float, (size_t)kMaxBranchDim * 64> MoEDoubleAccumulator::scratchT0 {}

◆ scratchT1

std::array<float, (size_t)kMaxBranchDim * 64> MoEDoubleAccumulator::scratchT1 {}

◆ threadPool

std::unique_ptr<PersistentThreadPool> MoEDoubleAccumulator::threadPool

◆ weights

const SharedMoEWeights* MoEDoubleAccumulator::weights = nullptr

The documentation for this struct was generated from the following file: