|
| const SharedMoEWeights & | shared_weights () const |
| SharedMoEWeights & | mutable_owned_weights () |
| void | reset_runtime_state () |
| void | copy_weights_from (const MoEDoubleAccumulator &src) |
| void | reset_profile () |
| template<typename Fn> |
| void | parallel_for_indices (int n, int min_parallel_n, Fn &&fn) |
| template<typename Fn> |
| void | parallel_for_indices (int n, Fn &&fn) |
| long long | total_weights () const |
| long long | single_expert_weights () const |
| long long | experts_total_weights () const |
| long long | backbone_weights () const |
| long long | runtime_topk_weights (int topk) const |
| void | init (const SharedMoEWeights *shared, const BenchConfig &cfg) |
| void | init (const BenchConfig &cfg) |
| void | fill_random (unsigned seed) |
| void | branch_forward_bd16_fast (const Branch &br, const float *HOT_RESTRICT in_planes, float *HOT_RESTRICT out, float *HOT_RESTRICT mid_plane, float *HOT_RESTRICT l1_accum) |
| void | branch_forward_with_scratch (int b, const float *in_planes, float *out, float *scratch0, float *scratch1) |
| void | branch_forward (int b, const float *in_planes, float *out) |
| void | rebuild_hidden_acc_from_flat (int e) |
| void | rebuild_hidden_acc_from_gap (int e) |
| void | rebuild_hidden_acc_from_pool2x2 (int e, bool max_pool) |
| float | global_proj_at (int oc, const float *g) const |
| void | top2_experts (const float *global, int &e0, int &e1, float &w0, float &w1) const |
| void | rebuild_expert_cache_from_mixer (int e) |
| void | full_rebuild_accumulators (const FactorizedInput &inp, const int *active_experts, int active_count) |
| | Performs a full forward pass of the model, discarding all cache.
|
| void | update_incremental (const FactorizedInput &cur, const FactorizedInput &prev, const int *dirty_branches, int dirty_count, const int *active_experts, int active_count) |
| | Performs an incremental network update by computing and applying only the differences.
|
| void | run_active_expert (int e, float out_wdl[3]) |
| | Computes the final hidden layer and WDL output for a single expert.
|
| void | run_top2_experts (int e0, int e1, float w0, float w1, float out_wdl[3]) |
| | Combines the output of the top 2 routed experts based on their routing weights.
|
|
| int | branchConvLayers = 3 |
| int | nThreads = 1 |
| int | minParallelDirtyHeads = 4 |
| int | minParallelActiveExperts = 3 |
| int | denseDirtySqThreshold = 16 |
| ExpertPoolMode | expertPoolMode = ExpertPoolMode::Pool2x2Avg |
| bool | routeSlowGlobals = false |
| const SharedMoEWeights * | weights = nullptr |
| std::shared_ptr< SharedMoEWeights > | ownedWeights {} |
| std::unique_ptr< PersistentThreadPool > | threadPool |
| std::array< float,(size_t) 12 *kMaxBranchDim *64 > | branchCache {} |
| std::array< float,(size_t) kMaxMixerOut *64 > | mixerLinearAccum {} |
| std::array< float,(size_t) kMaxMixerOut *64 > | mixerReluCache {} |
| std::array< std::array< float,(size_t) kMaxExpertBottleneck *64 >, kMaxExperts > | exPreAccum {} |
| std::array< std::array< float,(size_t) kMaxExpertBottleneck *64 >, kMaxExperts > | exReluCache {} |
| std::array< uint8_t, kMaxExperts > | exValid {} |
| std::array< std::array< float, kMaxExpertHidden >, kMaxExperts > | hiddenAcc {} |
| std::array< std::array< float, kMaxExpertBottleneck >, kMaxExperts > | exGapCache {} |
| std::array< std::array< float,(size_t) kMaxExpertBottleneck *kPool2x2Regions >, kMaxExperts > | exPool16Cache {} |
| std::array< float, kMaxGlobals > | oldGlobalV {} |
| std::array< float,(size_t) kMaxBranchDim *64 > | scratchT0 {} |
| std::array< float,(size_t) kMaxBranchDim *64 > | scratchT1 {} |
| std::array< float,(size_t) kMaxBranchDim *64 > | scratchNewBranch {} |
| std::array< std::array< float,(size_t) kMaxBranchDim *64 >, 12 > | scratchParallelBranch0 {} |
| std::array< std::array< float,(size_t) kMaxBranchDim *64 >, 12 > | scratchParallelBranch1 {} |
| std::array< float,(size_t) 12 *kMaxBranchDim *64 > | scratchDirtyBranches {} |
| std::array< float,(size_t) kMaxBranchDim *64 > | scratchBranchDelta {} |
| std::array< float,(size_t) kMaxBypass *64 > | scratchBypassDelta {} |
| std::array< float, kMaxMixerOut > | scratchGproj {} |
| std::array< float,(size_t) kMaxMixerOut *64 > | scratchDeltaRelu {} |
| std::array< float,(size_t) kMaxExpertBottleneck *64 > | scratchFlatDelta {} |
| std::array< std::array< float,(size_t) kMaxExpertBottleneck *64 >, kMaxExperts > | scratchParallelExpertDelta {} |
| std::array< float, kMaxExpertHidden > | scratchHidden {} |
| PhaseProfile | profile {} |
| bool | initialized = false |
Thread-local state for incremental, lightning-fast neural network inference.
The double accumulator pattern avoids running the full neural network on every position. Instead, it maintains a base state (base_ variables) corresponding to the parent position. When a move is played, it identifies which feature branches changed (e.g. only a Knight moved) and calculates the difference (dirty_branches). It then propagates only these differences through the mixer and into the active expert networks.
The update_incremental() function achieves <1 μs latency by leveraging this delta propagation.
| void MoEDoubleAccumulator::update_incremental |
( |
const FactorizedInput & | cur, |
|
|
const FactorizedInput & | prev, |
|
|
const int * | dirty_branches, |
|
|
int | dirty_count, |
|
|
const int * | active_experts, |
|
|
int | active_count ) |
|
inline |
Performs an incremental network update by computing and applying only the differences.
This is the heart of the engine's speed. By passing a list of dirty_branches, this function skips computing convolutions for pieces that haven't moved. It calculates the delta for the branches that did change, propagates that delta through the mixer layer, and updates the bottlenecks of the active experts.
- Parameters
-
| cur | The features for the new position. |
| prev | The features for the old (parent) position. |
| dirty_branches | Array of branch indices that changed. |
| dirty_count | Number of branches that changed. |
| active_experts | Array containing the indices of the currently active experts. |
| active_count | Number of active experts. |