@@ -1,6 +1,25 @@
 ---
 ---
 
+@inproceedings{merge,
+title={Bridging Domains through Subspace-Aware Model Merging},
+author={Levy Chaves and Chao Zhou and Rebekka Burkholz and Eduardo Valle and Sandra Avila},
+year={2026},
+booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+img={model-merging.png},
+}
+
+@inproceedings{sanyal2026games,
+title={Frequency-Based Hyperparameter Selection in Games},
+author={Aniket Sanyal and Baraah A. M. Sidahmed and Rebekka Burkholz and Tatjana Chavdarova},
+year={2026},
+booktitle={International Conference on Artificial Intelligence and Statistics},
+img={frequency-based-hyperparameter-selection.png},
+url={https://arxiv.org/abs/2601.18409},
+pdf={https://arxiv.org/pdf/2601.18409},
+abstract={Learning in smooth games fundamentally differs from standard minimization due to rotational dynamics, which invalidate classical hyperparameter tuning strategies. Despite their practical importance, effective methods for tuning in games remain underexplored. A notable example is LookAhead (LA), which achieves strong empirical performance but introduces additional parameters that critically influence performance. We propose a principled approach to hyperparameter selection in games by leveraging frequency estimation of oscillatory dynamics. Specifically, we analyze oscillations both in continuous-time trajectories and through the spectrum of the discrete dynamics in the associated frequency-based space. Building on this analysis, we introduce \emph{Modal LookAhead (MoLA)}, an extension of LA that selects the hyperparameters adaptively to a given problem. We provide convergence guarantees and demonstrate in experiments that MoLA accelerates training in both purely rotational games and mixed regimes, all with minimal computational overhead.}
+}
+
 @inproceedings{
 reddy2026boosting,
 title={Boosting for Predictive Sufficiency},
@@ -55,10 +74,23 @@ @inproceedings{ |
 author={Celia Rubio-Madrigal and Rebekka Burkholz},
 booktitle={Women in Machine Learning Workshop @ NeurIPS},
 year={2025},
-url={https://openreview.net/forum?id=OHgWEMce80},
-pdf={https://openreview.net/pdf?id=OHgWEMce80},
+url={https://arxiv.org/abs/2601.19449},
+pdf={https://arxiv.org/pdf/2601.19449},
 img={fixed-aggregation-features.png},
 abstract={Graph neural networks (GNNs) are widely believed to excel at node representation learning through trainable neighborhood aggregations. We challenge this view by introducing Fixed Aggregation Features (FAFs), a training-free approach that transforms graph learning tasks into tabular problems. This simple shift enables the use of well-established tabular methods, offering strong interpretability and the flexibility to deploy diverse classifiers. Across 14 benchmarks, well-tuned multilayer perceptrons trained on FAFs rival or outperform state-of-the-art GNNs and graph transformers on 12 tasks -- often using only mean aggregation. The only exceptions are the Roman Empire and Minesweeper datasets, which typically require unusually deep GNNs. To explain the theoretical possibility of non-trainable aggregations, we connect our findings to Kolmogorov–Arnold representations and discuss when mean aggregation can be sufficient. In conclusion, our results call for (i) richer benchmarks benefiting from learning diverse neighborhood aggregations, (ii) strong tabular baselines as standard, and (iii) employing and advancing tabular models for graph data to gain new insights into related tasks.},
+code={https://github.com/celrm/fixed-aggregation-features}
+}
+
+@inproceedings{
+gadhikar2025optrot,
+title={OptRot: Mitigating Weight Outliers via Data-Free Rotations for Post-Training Quantization},
+author={Advait Gadhikar and Riccardo Grazzi and James Hensman},
+booktitle={Machine Learning for Systems @ NeurIPS},
+year={2025},
+img={optrot.png},
+url={https://openreview.net/forum?id=4uwRaBhjHY},
+pdf={https://openreview.net/pdf?id=4uwRaBhjHY},
+abstract={We introduce OptRot, a data-free preprocessing method to learn fusible rotations for post-training quantization of language models. OptRot reduces weight outliers by finding rotations which minimize the element-wise fourth power of the rotated weights. We show how reducing weight outliers can provably improve weight quantization performance and how OptRot rotations can outperform both Hadamard rotations and rotations learned by the data-dependent method SpinQuant.},
 }
 
 @inproceedings{ zhou2025payattentionsmallweights,
@@ -106,6 +138,18 @@ @inproceedings{jacobs2025mirror |
 abstract={Implicit bias plays an important role in explaining how overparameterized models generalize well. Explicit regularization like weight decay is often employed in addition to prevent overfitting. While both concepts have been studied separately, in practice, they often act in tandem. Understanding their interplay is key to controlling the shape and strength of implicit bias, as it can be modified by explicit regularization. To this end, we incorporate explicit regularization into the mirror flow framework and analyze its lasting effects on the geometry of the training dynamics, covering three distinct effects: positional bias, type of bias, and range shrinking. Our analytical approach encompasses a broad class of problems, including sparse coding, matrix sensing, single-layer attention, and LoRA, for which we demonstrate the utility of our insights. To exploit the lasting effect of regularization and highlight the potential benefit of dynamic weight decay schedules, we propose to switch off weight decay during training, which can improve generalization, as we demonstrate in experiments.},
 }
 
+@inproceedings{
+gadhikar2025attention,
+title={Attention Is All You Need For Mixture-of-Depths Routing},
+author={Advait Gadhikar and Souptik Kumar Majumdar and Niclas Popp and Piyapat Saranrittichai and Martin Rapp and Lukas Schott},
+booktitle={Workshop on Scalable Optimization for Efficient and Adaptive Foundation Models @ ICLR},
+year={2025},
+img={mod-routing.png},
+url={https://openreview.net/forum?id=1uDP4ld3eZ},
+pdf={https://openreview.net/pdf?id=1uDP4ld3eZ},
+abstract={Advancements in deep learning are driven by training models with increasingly larger numbers of parameters, which in turn heightens the computational demands. To address this issue, Mixture-of-Depths (MoD) models have been proposed to dynamically focus computations on the most relevant parts of the inputs, thereby enabling the deployment of large-parameter models with high efficiency during inference and training. However, conventional MoD models employ additional network layers specifically for the routing which are difficult to train, and add complexity to the model. In this paper, we introduce a novel attention-based routing mechanism A-MoD that leverages the existing attention map of the preceding layer for routing decisions within the current layer. Compared to standard routing, A-MoD allows for more efficient training as it introduces no additional trainable parameters and can be easily adapted from pre-trained transformer models. Furthermore, it can increase the performance of the MoD model. For instance, we observe up to 2% higher accuracy on ImageNet compared to standard routing and isoFLOP ViT baselines.}
+}
+
 @inproceedings{
 jacobs2025mask,
 title={Mask in the Mirror: Implicit Sparsification},