Thread-local state for incremental, lightning-fast neural network inference. More...

#include <MoECacheModel.hpp>

Collaboration diagram for MoEDoubleAccumulator:

[legend]

Classes
struct	PhaseProfile

Public Member Functions
const SharedMoEWeights &	shared_weights () const
SharedMoEWeights &	mutable_owned_weights ()
void	reset_runtime_state ()
void	copy_weights_from (const MoEDoubleAccumulator &src)
void	reset_profile ()
template<typename Fn>
void	parallel_for_indices (int n, int min_parallel_n, Fn &&fn)
template<typename Fn>
void	parallel_for_indices (int n, Fn &&fn)
long long	total_weights () const
long long	single_expert_weights () const
long long	experts_total_weights () const
long long	backbone_weights () const
long long	runtime_topk_weights (int topk) const
void	init (const SharedMoEWeights *shared, const BenchConfig &cfg)
void	init (const BenchConfig &cfg)
void	fill_random (unsigned seed)
void	branch_forward_bd16_fast (const Branch &br, const float *HOT_RESTRICT in_planes, float *HOT_RESTRICT out, float *HOT_RESTRICT mid_plane, float *HOT_RESTRICT l1_accum)
void	branch_forward_with_scratch (int b, const float *in_planes, float *out, float *scratch0, float *scratch1)
void	branch_forward (int b, const float *in_planes, float *out)
void	rebuild_hidden_acc_from_flat (int e)
void	rebuild_hidden_acc_from_gap (int e)
void	rebuild_hidden_acc_from_pool2x2 (int e, bool max_pool)
float	global_proj_at (int oc, const float *g) const
void	top2_experts (const float *global, int &e0, int &e1, float &w0, float &w1) const
void	rebuild_expert_cache_from_mixer (int e)
void	full_rebuild_accumulators (const FactorizedInput &inp, const int *active_experts, int active_count)
	Performs a full forward pass of the model, discarding all cache.
void	update_incremental (const FactorizedInput &cur, const FactorizedInput &prev, const int *dirty_branches, int dirty_count, const int *active_experts, int active_count)
	Performs an incremental network update by computing and applying only the differences.
void	run_active_expert (int e, float out_wdl[3])
	Computes the final hidden layer and WDL output for a single expert.
void	run_top2_experts (int e0, int e1, float w0, float w1, float out_wdl[3])
	Combines the output of the top 2 routed experts based on their routing weights.

Static Public Member Functions
static void	validate_fixed_architecture (const BenchConfig &cfg)

Public Attributes
int	branchConvLayers = 3
int	nThreads = 1
int	minParallelDirtyHeads = 4
int	minParallelActiveExperts = 3
int	denseDirtySqThreshold = 16
ExpertPoolMode	expertPoolMode = ExpertPoolMode::Pool2x2Avg
bool	routeSlowGlobals = false
const SharedMoEWeights *	weights = nullptr
std::shared_ptr< SharedMoEWeights >	ownedWeights {}
std::unique_ptr< PersistentThreadPool >	threadPool
std::array< float,(size_t) 12 *kMaxBranchDim *64 >	branchCache {}
std::array< float,(size_t) kMaxMixerOut *64 >	mixerLinearAccum {}
std::array< float,(size_t) kMaxMixerOut *64 >	mixerReluCache {}
std::array< std::array< float,(size_t) kMaxExpertBottleneck *64 >, kMaxExperts >	exPreAccum {}
std::array< std::array< float,(size_t) kMaxExpertBottleneck *64 >, kMaxExperts >	exReluCache {}
std::array< uint8_t, kMaxExperts >	exValid {}
std::array< std::array< float, kMaxExpertHidden >, kMaxExperts >	hiddenAcc {}
std::array< std::array< float, kMaxExpertBottleneck >, kMaxExperts >	exGapCache {}
std::array< std::array< float,(size_t) kMaxExpertBottleneck *kPool2x2Regions >, kMaxExperts >	exPool16Cache {}
std::array< float, kMaxGlobals >	oldGlobalV {}
std::array< float,(size_t) kMaxBranchDim *64 >	scratchT0 {}
std::array< float,(size_t) kMaxBranchDim *64 >	scratchT1 {}
std::array< float,(size_t) kMaxBranchDim *64 >	scratchNewBranch {}
std::array< std::array< float,(size_t) kMaxBranchDim *64 >, 12 >	scratchParallelBranch0 {}
std::array< std::array< float,(size_t) kMaxBranchDim *64 >, 12 >	scratchParallelBranch1 {}
std::array< float,(size_t) 12 *kMaxBranchDim *64 >	scratchDirtyBranches {}
std::array< float,(size_t) kMaxBranchDim *64 >	scratchBranchDelta {}
std::array< float,(size_t) kMaxBypass *64 >	scratchBypassDelta {}
std::array< float, kMaxMixerOut >	scratchGproj {}
std::array< float,(size_t) kMaxMixerOut *64 >	scratchDeltaRelu {}
std::array< float,(size_t) kMaxExpertBottleneck *64 >	scratchFlatDelta {}
std::array< std::array< float,(size_t) kMaxExpertBottleneck *64 >, kMaxExperts >	scratchParallelExpertDelta {}
std::array< float, kMaxExpertHidden >	scratchHidden {}
PhaseProfile	profile {}
bool	initialized = false

Static Public Attributes
static constexpr int	bd = NET_BRANCH_DIM
static constexpr int	nf = NET_MIXER_OUT
static constexpr int	nBypass = NET_BYPASS
static constexpr int	nGlobals = NET_GLOBALS
static constexpr int	nExperts = NET_EXPERTS
static constexpr int	ebo = NET_EXPERT_BOTTLENECK
static constexpr int	eh = NET_EXPERT_HIDDEN

Detailed Description

Thread-local state for incremental, lightning-fast neural network inference.

The double accumulator pattern avoids running the full neural network on every position. Instead, it maintains a base state (base_ variables) corresponding to the parent position. When a move is played, it identifies which feature branches changed (e.g. only a Knight moved) and calculates the difference (dirty_branches). It then propagates only these differences through the mixer and into the active expert networks.

The update_incremental() function achieves <1 μs latency by leveraging this delta propagation.

Member Function Documentation

◆ backbone_weights()

long long MoEDoubleAccumulator::backbone_weights ( ) const

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ branch_forward()

void MoEDoubleAccumulator::branch_forward	(	int	b,
		const float *	in_planes,
		float *	out )

inline

Here is the call graph for this function:

◆ branch_forward_bd16_fast()

void MoEDoubleAccumulator::branch_forward_bd16_fast	(	const Branch &	br,
		const float *HOT_RESTRICT	in_planes,
		float *HOT_RESTRICT	out,
		float *HOT_RESTRICT	mid_plane,
		float *HOT_RESTRICT	l1_accum )

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ branch_forward_with_scratch()

void MoEDoubleAccumulator::branch_forward_with_scratch	(	int	b,
		const float *	in_planes,
		float *	out,
		float *	scratch0,
		float *	scratch1 )

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ copy_weights_from()

void MoEDoubleAccumulator::copy_weights_from ( const MoEDoubleAccumulator & src )

inline

Here is the call graph for this function:

◆ experts_total_weights()

long long MoEDoubleAccumulator::experts_total_weights ( ) const

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fill_random()

void MoEDoubleAccumulator::fill_random ( unsigned seed )

inline

Here is the call graph for this function:

◆ full_rebuild_accumulators()

void MoEDoubleAccumulator::full_rebuild_accumulators	(	const FactorizedInput &	inp,
		const int *	active_experts,
		int	active_count )

inline

Performs a full forward pass of the model, discarding all cache.

This is used for the very first evaluation of a game (where there is no previous state to incrementally update from), or when the position changes so drastically (e.g. >6 dirty branches) that an incremental update would be slower than a full pass.

Parameters

inp	The full spatial features of the current position.
active_experts	Array containing the indices of the experts chosen by the router.
active_count	The number of active experts (usually 2).

Here is the call graph for this function:

Here is the caller graph for this function:

◆ global_proj_at()

float MoEDoubleAccumulator::global_proj_at	(	int	oc,
		const float *	g ) const

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ init() [1/2]

void MoEDoubleAccumulator::init ( const BenchConfig & cfg )

inline

Here is the call graph for this function:

◆ init() [2/2]

void MoEDoubleAccumulator::init	(	const SharedMoEWeights *	shared,
		const BenchConfig &	cfg )

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutable_owned_weights()

SharedMoEWeights & MoEDoubleAccumulator::mutable_owned_weights ( )

inline

Here is the caller graph for this function:

◆ parallel_for_indices() [1/2]

template<typename Fn>

void MoEDoubleAccumulator::parallel_for_indices	(	int	n,
		Fn &&	fn )

inline

Here is the call graph for this function:

◆ parallel_for_indices() [2/2]

template<typename Fn>

void MoEDoubleAccumulator::parallel_for_indices	(	int	n,
		int	min_parallel_n,
		Fn &&	fn )

inline

Here is the caller graph for this function:

◆ rebuild_expert_cache_from_mixer()

void MoEDoubleAccumulator::rebuild_expert_cache_from_mixer ( int e )

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ rebuild_hidden_acc_from_flat()

void MoEDoubleAccumulator::rebuild_hidden_acc_from_flat ( int e )

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ rebuild_hidden_acc_from_gap()

void MoEDoubleAccumulator::rebuild_hidden_acc_from_gap ( int e )

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ rebuild_hidden_acc_from_pool2x2()

void MoEDoubleAccumulator::rebuild_hidden_acc_from_pool2x2	(	int	e,
		bool	max_pool )

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ reset_profile()

void MoEDoubleAccumulator::reset_profile ( )

inline

◆ reset_runtime_state()

void MoEDoubleAccumulator::reset_runtime_state ( )

inline

Here is the caller graph for this function:

◆ run_active_expert()

void MoEDoubleAccumulator::run_active_expert	(	int	e,
		float	out_wdl[3] )

inline

Computes the final hidden layer and WDL output for a single expert.

Takes the accumulated bottleneck state for the given expert, passes it through the expert's hidden layer (with ReLU), and multiplies by the final Win/Draw/Loss weights.

Parameters

e	The index of the expert to run.
out_wdl	A 3-element float array where the un-normalized WDL logits will be stored.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ run_top2_experts()

void MoEDoubleAccumulator::run_top2_experts	(	int	e0,
		int	e1,
		float	w0,
		float	w1,
		float	out_wdl[3] )

inline

Combines the output of the top 2 routed experts based on their routing weights.

Evaluates both experts independently and takes a weighted average of their Win/Draw/Loss logits using the probabilities assigned by the Router gate.

Parameters

e0	Index of the best expert.
e1	Index of the second-best expert.
w0	Routing weight (probability) for the best expert.
w1	Routing weight (probability) for the second-best expert.
out_wdl	Array to store the combined WDL logits.

Here is the call graph for this function:

◆ runtime_topk_weights()

long long MoEDoubleAccumulator::runtime_topk_weights ( int topk ) const

inline

Here is the call graph for this function:

◆ shared_weights()

const SharedMoEWeights & MoEDoubleAccumulator::shared_weights ( ) const

inline

Here is the caller graph for this function:

◆ single_expert_weights()

long long MoEDoubleAccumulator::single_expert_weights ( ) const

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ top2_experts()

void MoEDoubleAccumulator::top2_experts	(	const float *	global,
		int &	e0,
		int &	e1,
		float &	w0,
		float &	w1 ) const

inline

Here is the call graph for this function:

◆ total_weights()

long long MoEDoubleAccumulator::total_weights ( ) const

inline

Here is the call graph for this function:

Here is the caller graph for this function:

◆ update_incremental()

void MoEDoubleAccumulator::update_incremental	(	const FactorizedInput &	cur,
		const FactorizedInput &	prev,
		const int *	dirty_branches,
		int	dirty_count,
		const int *	active_experts,
		int	active_count )

inline

Performs an incremental network update by computing and applying only the differences.

This is the heart of the engine's speed. By passing a list of dirty_branches, this function skips computing convolutions for pieces that haven't moved. It calculates the delta for the branches that did change, propagates that delta through the mixer layer, and updates the bottlenecks of the active experts.

Parameters

cur	The features for the new position.
prev	The features for the old (parent) position.
dirty_branches	Array of branch indices that changed.
dirty_count	Number of branches that changed.
active_experts	Array containing the indices of the currently active experts.
active_count	Number of active experts.

Here is the call graph for this function:

◆ validate_fixed_architecture()

void MoEDoubleAccumulator::validate_fixed_architecture ( const BenchConfig & cfg )

inline static

Here is the caller graph for this function:

Member Data Documentation

◆ bd

int MoEDoubleAccumulator::bd = NET_BRANCH_DIM

static constexpr

◆ branchCache

std::array<float, (size_t)12 * kMaxBranchDim * 64> MoEDoubleAccumulator::branchCache {}

◆ branchConvLayers

int MoEDoubleAccumulator::branchConvLayers = 3

◆ denseDirtySqThreshold

int MoEDoubleAccumulator::denseDirtySqThreshold = 16

◆ ebo

int MoEDoubleAccumulator::ebo = NET_EXPERT_BOTTLENECK

static constexpr

◆ eh

int MoEDoubleAccumulator::eh = NET_EXPERT_HIDDEN

static constexpr

◆ exGapCache

std::array<std::array<float, kMaxExpertBottleneck>, kMaxExperts> MoEDoubleAccumulator::exGapCache {}

◆ expertPoolMode

ExpertPoolMode MoEDoubleAccumulator::expertPoolMode = ExpertPoolMode::Pool2x2Avg

◆ exPool16Cache

std::array< std::array<float, (size_t)kMaxExpertBottleneck * kPool2x2Regions>, kMaxExperts> MoEDoubleAccumulator::exPool16Cache {}

◆ exPreAccum

std::array<std::array<float, (size_t)kMaxExpertBottleneck * 64>, kMaxExperts> MoEDoubleAccumulator::exPreAccum {}

◆ exReluCache

std::array<std::array<float, (size_t)kMaxExpertBottleneck * 64>, kMaxExperts> MoEDoubleAccumulator::exReluCache {}

◆ exValid

std::array<uint8_t, kMaxExperts> MoEDoubleAccumulator::exValid {}

◆ hiddenAcc

std::array<std::array<float, kMaxExpertHidden>, kMaxExperts> MoEDoubleAccumulator::hiddenAcc {}

◆ initialized

bool MoEDoubleAccumulator::initialized = false

◆ minParallelActiveExperts

int MoEDoubleAccumulator::minParallelActiveExperts = 3

◆ minParallelDirtyHeads

int MoEDoubleAccumulator::minParallelDirtyHeads = 4

◆ mixerLinearAccum

std::array<float, (size_t)kMaxMixerOut * 64> MoEDoubleAccumulator::mixerLinearAccum {}

◆ mixerReluCache

std::array<float, (size_t)kMaxMixerOut * 64> MoEDoubleAccumulator::mixerReluCache {}

◆ nBypass

int MoEDoubleAccumulator::nBypass = NET_BYPASS

static constexpr

◆ nExperts

int MoEDoubleAccumulator::nExperts = NET_EXPERTS

static constexpr

◆ nf

int MoEDoubleAccumulator::nf = NET_MIXER_OUT

static constexpr

◆ nGlobals

int MoEDoubleAccumulator::nGlobals = NET_GLOBALS

static constexpr

◆ nThreads

int MoEDoubleAccumulator::nThreads = 1

◆ oldGlobalV

std::array<float, kMaxGlobals> MoEDoubleAccumulator::oldGlobalV {}

◆ ownedWeights

std::shared_ptr<SharedMoEWeights> MoEDoubleAccumulator::ownedWeights {}

◆ profile

PhaseProfile MoEDoubleAccumulator::profile {}

◆ routeSlowGlobals

bool MoEDoubleAccumulator::routeSlowGlobals = false

◆ scratchBranchDelta

std::array<float, (size_t)kMaxBranchDim * 64> MoEDoubleAccumulator::scratchBranchDelta {}

◆ scratchBypassDelta

std::array<float, (size_t)kMaxBypass * 64> MoEDoubleAccumulator::scratchBypassDelta {}

◆ scratchDeltaRelu

std::array<float, (size_t)kMaxMixerOut * 64> MoEDoubleAccumulator::scratchDeltaRelu {}

◆ scratchDirtyBranches

std::array<float, (size_t)12 * kMaxBranchDim * 64> MoEDoubleAccumulator::scratchDirtyBranches {}

◆ scratchFlatDelta

std::array<float, (size_t)kMaxExpertBottleneck * 64> MoEDoubleAccumulator::scratchFlatDelta {}

◆ scratchGproj

std::array<float, kMaxMixerOut> MoEDoubleAccumulator::scratchGproj {}

◆ scratchHidden

std::array<float, kMaxExpertHidden> MoEDoubleAccumulator::scratchHidden {}

◆ scratchNewBranch

std::array<float, (size_t)kMaxBranchDim * 64> MoEDoubleAccumulator::scratchNewBranch {}

◆ scratchParallelBranch0

std::array<std::array<float, (size_t)kMaxBranchDim * 64>, 12> MoEDoubleAccumulator::scratchParallelBranch0 {}

◆ scratchParallelBranch1

std::array<std::array<float, (size_t)kMaxBranchDim * 64>, 12> MoEDoubleAccumulator::scratchParallelBranch1 {}

◆ scratchParallelExpertDelta

std::array<std::array<float, (size_t)kMaxExpertBottleneck * 64>, kMaxExperts> MoEDoubleAccumulator::scratchParallelExpertDelta {}

◆ scratchT0

std::array<float, (size_t)kMaxBranchDim * 64> MoEDoubleAccumulator::scratchT0 {}

◆ scratchT1

std::array<float, (size_t)kMaxBranchDim * 64> MoEDoubleAccumulator::scratchT1 {}

◆ threadPool

std::unique_ptr<PersistentThreadPool> MoEDoubleAccumulator::threadPool

◆ weights

const SharedMoEWeights* MoEDoubleAccumulator::weights = nullptr

The documentation for this struct was generated from the following file:

engine/src/eval/MoECacheModel.hpp

Classes

Public Member Functions

Static Public Member Functions

Public Attributes

Static Public Attributes

Detailed Description

Member Function Documentation

◆ backbone_weights()

◆ branch_forward()

◆ branch_forward_bd16_fast()

◆ branch_forward_with_scratch()

◆ copy_weights_from()

◆ experts_total_weights()

◆ fill_random()

◆ full_rebuild_accumulators()

◆ global_proj_at()

◆ init() [1/2]

◆ init() [2/2]

◆ mutable_owned_weights()

◆ parallel_for_indices() [1/2]

◆ parallel_for_indices() [2/2]

◆ rebuild_expert_cache_from_mixer()

◆ rebuild_hidden_acc_from_flat()

◆ rebuild_hidden_acc_from_gap()

◆ rebuild_hidden_acc_from_pool2x2()

◆ reset_profile()

◆ reset_runtime_state()

◆ run_active_expert()

◆ run_top2_experts()

◆ runtime_topk_weights()

◆ shared_weights()

◆ single_expert_weights()

◆ top2_experts()

◆ total_weights()

◆ update_incremental()

◆ validate_fixed_architecture()

Member Data Documentation

◆ bd

◆ branchCache

◆ branchConvLayers

◆ denseDirtySqThreshold

◆ ebo

◆ eh

◆ exGapCache

◆ expertPoolMode

◆ exPool16Cache

◆ exPreAccum

◆ exReluCache

◆ exValid

◆ hiddenAcc

◆ initialized

◆ minParallelActiveExperts

◆ minParallelDirtyHeads

◆ mixerLinearAccum

◆ mixerReluCache

◆ nBypass

◆ nExperts

◆ nf

◆ nGlobals

◆ nThreads

◆ oldGlobalV

◆ ownedWeights

◆ profile

◆ routeSlowGlobals

◆ scratchBranchDelta

◆ scratchBypassDelta

◆ scratchDeltaRelu

◆ scratchDirtyBranches

◆ scratchFlatDelta

◆ scratchGproj

◆ scratchHidden

◆ scratchNewBranch

◆ scratchParallelBranch0

◆ scratchParallelBranch1

◆ scratchParallelExpertDelta

◆ scratchT0

◆ scratchT1

◆ threadPool

◆ weights