|
| | model = ChessNetFactorizedMoE(expert_bottleneck=16, mixer_out=512) |
| list | planes_list = [torch.randn(1, in_ch, 8, 8) for in_ch in ChessNetFactorizedMoE.PLANES_PER_TYPE] |
| | bypass = torch.randn(1, 12, 8, 8) |
| | global_v = torch.randn(1, 21) |
| | wdl = model(planes_list, bypass, global_v) |
| | total_params = sum(p.numel() for p in model.parameters()) |
| | branches_params = sum(p.numel() for n, p in model.named_parameters() if "branches" in n) |
| | stem_global_params = sum(p.numel() for n, p in model.named_parameters() if "stem_global" in n) |
| | pointwise_mixer_params = sum(p.numel() for n, p in model.named_parameters() if "pointwise_mixer" in n) |
| | backbone_params = sum(p.numel() for n, p in model.named_parameters() if "branches" in n or "pointwise_mixer" in n or "stem_global" in n) |
| | expert_params = sum(p.numel() for n, p in model.named_parameters() if "experts" in n) |
@file model.py
@brief Lightweight MoE Model for Chess Evaluation (Factorized Cache Architecture).
RULES FOR C++ PERFORMANCE:
1. NO 3x3 CONVOLUTIONS AFTER THE MIXER (this preserves spatial caching).
2. Dense layers must be preceded by a 1x1 bottleneck to minimize the size of the flattened input.