The Polar Express for Muon, Visualized

An interactive introduction

Recently, a new optimizer called Muon has been proposed for training deep learning models with matrix-valued weights (Jordan et al. (2024)). This optimizer seems to be particularly effective for training transformer-based LLMs, consistently outperforming the previous state-of-the-art AdamW optimizer on a variety of benchmarks.

A key step in Muon (and really the big change from past optimizers) is to replace the gradient of the loss with respect to each weight matrix with its polar decomposition (see e.g. the PyTorch implementation).[1] Mathematically, the polar decomposition of a matrix $\vec{G}$ is

$$\polar(\vec{G}) := \vec{U} \vec{V}^\T, \qquad \text{where } \vec{G} = \vec{U} \vec{\Sigma} \vec{V}^\T \text{ is the SVD of } \vec{G}.$$

Of course, this definition suggests that we can simply use something like torch.svd, and this does produce good training behavior in terms of iterations. However, the SVD is horrifyingly slow on GPU 🐌, so an SVD-based implementation of Muon would be intractable. Instead, implementations of Muon use approximations to the polar decomposition that are computed using only matrix-matrix multiplications and additions.
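For reference, here is what an SVD-based implementation looks like (a NumPy sketch for illustration only; actual Muon implementations operate on PyTorch tensors and, as discussed next, avoid the SVD entirely):

```python
import numpy as np

def polar_via_svd(G):
    """Reference implementation: polar(G) = U V^T from the (slow) SVD."""
    U, _, Vt = np.linalg.svd(G, full_matrices=False)  # thin SVD: G = U diag(s) V^T
    return U @ Vt

rng = np.random.default_rng(0)
G = rng.standard_normal((8, 6))
P = polar_via_svd(G)
# For a full-rank tall matrix, the polar factor has orthonormal columns: P^T P = I.
```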

Recently, Amsel et al. (2026) introduced the PolarExpress method, a principled approach to constructing an approximation to the polar decomposition within Muon. This approach has gained attention in part because the performance of Muon with PolarExpress seems to be uniformly better than with previous methods such as NS-5. More generally, PolarExpress is interesting because it shows how tools from fields like numerical analysis and approximation theory can be used to improve algorithms used in machine learning.

Polynomial approximation

Matrix-matrix multiplication and addition are very fast ⚡ on GPUs (these operations are exactly what GPUs are designed for)! As such, we’d like to compute the polar decomposition using only matrix-matrix multiplication and addition.

Note that the matrices

$$\vec{G}(\vec{G}^\T \vec{G}),\ \vec{G}(\vec{G}^\T \vec{G})^2,\ \ldots$$

all have the same shape as $\vec{G}$ and can be computed using matrix-matrix multiplications with $\vec{G}$. This suggests that we could try to build an approximation to the polar decomposition of the form

$$\polar(\vec{G}) \approx a_0 \vec{G} + a_1 \vec{G}(\vec{G}^\T\vec{G}) + a_2 \vec{G}(\vec{G}^\T\vec{G})^2 + \cdots + a_q \vec{G}(\vec{G}^\T\vec{G})^q =: p(\vec{G}).$$

Connection to scalar approximation

The definition of $p(\vec{G})$ above (and the title of the section) suggests a connection to polynomials. Indeed, with a bit of linear algebra, one verifies that

$$p(\vec{G}) = \vec{U} p(\vec{\Sigma}) \vec{V}^\T,$$

where

p(x)=a0x+a1x3++aqx2q+1p(x) = a_0 x + a_1 x^3 + \cdots + a_q x^{2q+1}

and $p(\vec{\Sigma})$ is the diagonal matrix whose diagonal entries are $p(\sigma_i)$ (notice that we are only allowed to use odd powers, since even powers wouldn't be the right shape).

By the unitary invariance of the spectral norm, we see that the error of a polynomial approximation to the polar decomposition is exactly the error of the scalar polynomial approximation:

$$\begin{align} \| \polar(\vec{G}) - p(\vec{G}) \|_{\mathrm{op}} &= \| \vec{U} \vec{I} \vec{V}^\T - \vec{U} p(\vec{\Sigma}) \vec{V}^\T \|_{\mathrm{op}} \\ &= \| \vec{I} - p(\vec{\Sigma}) \|_{\mathrm{op}} \\ &= \max_i | 1 - p(\sigma_i) |. \end{align}$$

Thus, we have reduced the problem of approximating the polar decomposition on GPUs to a scalar-valued problem in approximation theory: approximating the function $f(x) = 1$ on the singular values of $\vec{G}$ using an odd polynomial.
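We can check both the matrix identity and the error equality numerically (a NumPy sketch; the coefficients below are arbitrary, chosen only for illustration):

```python
import numpy as np

rng = np.random.default_rng(1)
G = rng.standard_normal((7, 5))
U, s, Vt = np.linalg.svd(G, full_matrices=False)

a0, a1, a2 = 0.9, 0.3, -0.1          # arbitrary coefficients for illustration
GtG = G.T @ G
pG = a0 * G + a1 * G @ GtG + a2 * G @ GtG @ GtG   # p(G)
ps = a0 * s + a1 * s**3 + a2 * s**5               # p applied to the singular values
# p(G) = U p(Sigma) V^T, and the spectral-norm error is max_i |1 - p(sigma_i)|
err = np.linalg.norm(U @ Vt - pG, 2)              # ord=2 gives the spectral norm
```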

Of course, since we don't know the singular values of $\vec{G}$, we might instead try to approximate the function $f(x) = 1$ on an interval $[\ell,u]$, where $\ell$ and $u$ are chosen so that $\sigma_i \in [\ell,u]$ for all singular values $\sigma_i$ of $\vec{G}$. Since $\cup_i \{\sigma_i\} \subset [\ell,u]$,

$$\| \polar(\vec{G}) - p(\vec{G}) \|_{\mathrm{op}} \leq \max_{x\in[\ell,u]} | 1 - p(x) |.$$

The above inequality is sharp: for any $\ell$, $u$, and $p(x)$, there exists a matrix $\vec{G}$ with singular values in $[\ell,u]$ such that the error of the approximation is exactly $\max_{x\in[\ell,u]} | 1 - p(x) |$.
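For instance, a diagonal matrix whose singular values all sit at the maximizer of $|1 - p(x)|$ attains the bound. A small NumPy check (the odd polynomial here is arbitrary):

```python
import numpy as np

p = lambda x: 1.2 * x - 0.4 * x**3        # any odd polynomial works here
l, u = 0.1, 1.0
grid = np.linspace(l, u, 10001)
worst = grid[np.argmax(np.abs(1 - p(grid)))]   # where |1 - p(x)| is largest
G = np.diag([worst, worst])   # singular values all at the worst point
# polar(G) = I, and for diagonal G the elementwise powers in p agree with
# matrix powers, so the approximation error equals the scalar bound exactly.
err = np.linalg.norm(np.eye(2) - p(G), 2)
bound = np.abs(1 - p(grid)).max()
```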

First approach: Newton-Schulz

How should we choose the coefficients $a_0, a_1, \ldots, a_q$ of our polynomial approximation $p(x)$ to the sign function?

A simple approach is to choose the coefficients so that the approximation matches properties of the sign function. For instance, we might specify that we will choose an approximation with $q=1$, so that $p(x) = a_0 x + a_1 x^3$. Since there are two free parameters, we can enforce the conditions $p(u) = 1$ and $p'(u) = 0$. When $u=1$, this gives us the system of equations

$$\left\{ \begin{aligned} a_0 + a_1 &= 1 \\ a_0 + 3a_1 &= 0 \end{aligned} \right.$$

which has solution $a_0 = 3/2$ and $a_1 = -1/2$.

If we allow a higher degree polynomial, we can enforce that higher derivatives are also zero. For instance, if we choose $p(x) = a_0 x + a_1 x^3 + a_2 x^5$, corresponding to $q=2$, then we can enforce that $p(u) = 1$, $p'(u) = 0$, and $p''(u) = 0$. When $u=1$ this gives the polynomial

$$x \mapsto \frac{15}{8} x - \frac{10}{8} x^3 + \frac{3}{8} x^5.$$

We can construct a polynomial in this way for any $q$ by enforcing that $p(u) = 1$ and $p^{(i)}(u) = 0$ for $i=1,\ldots,q$. These polynomials are commonly called the Newton-Schulz polynomials of degree $d = 2q+1$.
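These coefficients can be computed for any $q$ by solving the linear system encoding the conditions $p(1) = 1$ and $p^{(i)}(1) = 0$ (a sketch; `ns_coeffs` is a hypothetical helper name, not from any Muon codebase):

```python
import numpy as np
from math import perm  # perm(n, i) = n (n-1) ... (n-i+1), the falling factorial

def ns_coeffs(q):
    """Coefficients a_0..a_q of the degree-(2q+1) Newton-Schulz polynomial."""
    # Row i encodes p^(i)(1): the i-th derivative of x^(2k+1) at x = 1 is perm(2k+1, i).
    M = np.array([[perm(2 * k + 1, i) for k in range(q + 1)]
                  for i in range(q + 1)], dtype=float)
    rhs = np.zeros(q + 1)
    rhs[0] = 1.0   # p(1) = 1; the q higher derivatives vanish
    return np.linalg.solve(M, rhs)
```

For `q=1` this recovers $(3/2, -1/2)$ and for `q=2` it recovers $(15/8, -10/8, 3/8)$, matching the polynomials derived above.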

The following figure shows how increasing the degree $d = 2q+1$ of the approximation affects the error of the approximation on the interval $[\ell,1]$. As the degree increases, the error of the approximation decreases. However, the convergence is not very good, particularly when $\ell$ is small.

*(Interactive figure: error of the degree-$d$ Newton-Schulz approximation on $[\ell,1]$.)*

Composition

To efficiently compute a high-degree polynomial, we can use an iterative procedure

$$\vec{X}_{t} = p(\vec{X}_{t-1}), \qquad \vec{X}_0 = \vec{G},$$

where p(x)p(x) is a low degree polynomial (such as the ones we derived above). This iteration corresponds to an approximation

$$\vec{X}_T = p_T(\vec{G}), \quad \text{where} \quad p_T = \underbrace{p \circ p \circ \cdots \circ p}_{T \text{ times}}.$$

Owing to its compositional nature, $p_T(x)$ is of degree $d^T$, where $d$ is the degree of $p(x)$. However, using the iteration above, we can compute $p_T(\vec{G})$ using only $O(dT)$ matrix multiplications. In other words, we can efficiently construct certain polynomials of exponentially high degree!
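To see this concretely with the degree-5 Newton-Schulz polynomial derived above, six compositions already evaluate a polynomial of degree $5^6 = 15625$ at the cost of six evaluations of $p$:

```python
# Composing the degree-5 Newton-Schulz polynomial: T steps evaluate p_T,
# a polynomial of degree 5**T, using only T evaluations of p.
p = lambda x: (15 * x - 10 * x**3 + 3 * x**5) / 8

x = 0.2             # stands in for a singular value of G
for _ in range(6):  # p_6 has degree 5**6 = 15625
    x = p(x)
# x is now extremely close to 1 = sign(0.2)
```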

Critically, since the degree $d=5$ Newton-Schulz polynomial $p(x)$ satisfies $p'(u) = p''(u) = 0$ by construction, we expect the fixed-point iteration to converge cubically!

Let’s see how the error of the approximation changes as we increase the number of iterations $T$.

*(Interactive figure: error of the composed Newton-Schulz approximation as $T$ increases.)*

This time, we see convergence is very fast (faster than exponential, since the error decreases faster than a line on a log-plot)! The NS-5 method used in some variants of Muon uses this iterative procedure to compute the approximation.
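The matrix version of this iteration is a few lines of NumPy (a sketch assuming the input is first scaled so its singular values lie in $(0,1]$; note that computing the exact spectral norm for this scaling uses an SVD, so practical implementations substitute a cheap upper bound like the Frobenius norm):

```python
import numpy as np

def ns5_polar(G, T=20):
    """Approximate polar(G) by iterating the degree-5 Newton-Schulz polynomial."""
    X = G / np.linalg.norm(G, 2)   # scale so all singular values lie in (0, 1]
    for _ in range(T):
        A = X.T @ X
        X = (15 * X - 10 * X @ A + 3 * X @ A @ A) / 8
    return X

rng = np.random.default_rng(2)
G = rng.standard_normal((8, 6))
U, _, Vt = np.linalg.svd(G, full_matrices=False)
approx = ns5_polar(G)
# approx should be close to the exact polar factor U V^T
```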

Polar Express: Optimal approximation

It’s natural to ask whether we can do better than Newton-Schulz. Towards this end, several modifications have been proposed for use within Muon (Jordan et al. (2024), Cesista et al. (2025)). These changes eke out a bit of performance, but ultimately are based on heuristics and trial-and-error.

PolarExpress offers a principled approach.

With what we’ve seen so far, it’s actually pretty simple to describe PolarExpress.

Note that the bound above suggests that we can try to find the best possible polynomial approximation to 1 on $[\ell,u]$. In particular, we can try to solve

$$\min_{p \in \mathbb{P}_d^{\mathrm{odd}}} \max_{x \in [\ell,u]} |1 - p(x)|,$$

where

$$\mathbb{P}_d^{\mathrm{odd}} := \{ \text{odd polynomials of degree at most } d \}.$$

Approximation theorists love best approximation, and have developed lots of tools for solving problems like the one above. This particular problem can be solved using a variant of the Remez algorithm.
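One simple way to approximate the solution of the discretized minimax problem with plain NumPy is Lawson's iteratively reweighted least-squares algorithm, a classical alternative to the Remez exchange (a sketch for intuition, not the algorithm used in the PolarExpress paper; here $d = 2q+1$):

```python
import numpy as np

def minimax_odd_poly(l, u, q, n_grid=2000, iters=300):
    """Approximate coefficients a_0..a_q of the odd polynomial
    p(x) = sum_k a_k x^(2k+1) minimizing max_{x in [l,u]} |1 - p(x)|,
    via Lawson's iteratively reweighted least squares on a grid."""
    x = np.linspace(l, u, n_grid)
    A = np.stack([x ** (2 * k + 1) for k in range(q + 1)], axis=1)
    w = np.full(n_grid, 1.0 / n_grid)
    for _ in range(iters):
        sw = np.sqrt(w)
        # weighted least-squares fit of p(x) to the constant 1
        a, *_ = np.linalg.lstsq(A * sw[:, None], sw, rcond=None)
        w = w * np.abs(1.0 - A @ a)   # Lawson update: reweight by current error
        w = w / w.sum()
    return a
```

On $[\ell, u] = [0.5, 1]$ with $d = 5$, the resulting polynomial has a noticeably smaller worst-case error than the degree-5 Newton-Schulz polynomial.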

For simplicity, let’s consider the case $d=5$ and see what the optimal approximation on $[\ell,u]$ looks like.

*(Interactive figure: the optimal degree-5 polynomial on $[\ell,u]$ versus Newton-Schulz.)*

Notice that when $\ell=u$, the optimal polynomial and the degree 5 Newton-Schulz polynomial are identical. However, as $\ell$ decreases, the optimal polynomial starts to deviate from the Newton-Schulz polynomial. In particular, the polynomial goes above 1 on some parts of the interval and below 1 on other parts of the interval. Intuitively this makes sense: if we want the maximum deviation of $p(x)$ from 1 to be as small as possible, we should allow $p(x)$ to deviate from 1 in both directions, rather than forcing it to always be below 1 like the NS-5 polynomial does.

In fact, the optimal polynomial satisfies an equioscillation property: the maximum deviation of $p(x)$ from 1 is attained, with alternating signs, at $\ell$, $u$, and the two interior local extrema of $p$. This type of behavior is characteristic of best approximations (and is key to how the Remez algorithm works).

Composition

Just like with the Newton-Schulz polynomial, we would like to use a composition of the optimal polynomial to get a better approximation. Notice that the optimal polynomial on $[\ell,u]$ maps the interval $[\ell,u]$ to a new interval centered around 1, which is often much smaller than $[\ell,u]$. This means that in the next step of the iteration, we can use the optimal polynomial for the new interval to get an even better approximation.

In particular, let’s use an iteration of the form

$$\vec{X}_{t} = p_t(\vec{X}_{t-1}), \qquad \vec{X}_0 = \vec{G}.$$

We’ll take $p_1(x)$ to be the optimal polynomial on the original interval $[\ell,u]$ as described above. Now, define

$$\ell_2 = \min_{x\in[\ell,u]} p_1(x), \qquad u_2 = \max_{x\in[\ell,u]} p_1(x).$$

This gives us a new interval $[\ell_2,u_2]$ that contains the singular values of $p_1(\vec{G})$, so we can take $p_2(x)$ to be the optimal polynomial on $[\ell_2,u_2]$. This gives a new interval $[\ell_3,u_3]$, which we can use to construct $p_3$, and so on.

*(Interactive figure: how each $p_t$ maps $[\ell_t,u_t]$ to a smaller interval around 1.)*
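The interval-tracking mechanics can be sketched with a stand-in polynomial (here the fixed degree-5 Newton-Schulz polynomial rather than the per-step optimal ones, so the shrinkage is slower than in Polar Express):

```python
import numpy as np

def p(x):  # stand-in for the per-step polynomial (degree-5 Newton-Schulz)
    return (15 * x - 10 * x**3 + 3 * x**5) / 8

l, u = 1e-3, 1.0
intervals = [(l, u)]
for t in range(14):
    grid = np.linspace(l, u, 10001)
    vals = p(grid)
    l, u = vals.min(), vals.max()   # [l_{t+1}, u_{t+1}] contains p([l_t, u_t])
    intervals.append((l, u))
# The intervals shrink toward the point {1}.
```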

More formally, the Polar Express polynomial is defined as the composition $p_T \circ \cdots \circ p_1$, where each $p_t$ is the optimal degree-$d$ polynomial on $[\ell_t,u_t]$ and the intervals are updated greedily as above.

The PolarExpress algorithm simply uses the iteration $\vec{X}_t = p_t(\vec{X}_{t-1})$ to apply the Polar Express polynomial.

Now, let’s look at what the Polar Express polynomial looks like as we change the number of iterations $T$!

*(Interactive figure: the Polar Express polynomial for varying $T$.)*

Observe that the Polar Express polynomial is a much better approximation to 1 than the Newton-Schulz polynomial, particularly when $\ell$ is small.

We also note that, as before, the Polar Express polynomial satisfies an equioscillation property.

Greedy is optimal

Remarkably, this iterative greedy procedure for constructing the polynomials is actually optimal among all procedures that use a degree $d$ polynomial at each step 🤯.

This means that no other choice of composition of $T$ polynomials of degree $d$ outperforms Polar Express.

We now visualize the error

$$\max_{x\in[\ell,u]} |1-p(x)|$$

for the Polar Express and Newton-Schulz polynomials. When $\ell$ is small, Polar Express converges in roughly half the iterations of Newton-Schulz.

*(Interactive figure: approximation error of Polar Express versus Newton-Schulz.)*

Conclusion

The fact that the Polar Express polynomial approximation is optimal among all approximations of its form is a nice theoretical guarantee. However, since it’s not super clear why Muon works in the first place (to me at least!), it’s also not clear that a better approximation to the polar decomposition actually leads to better performance of Muon. Nevertheless, the Polar Express algorithm is a beautiful approach to approximating the polar decomposition that seems to be a good fit for the needs of Muon, and it serves as a good starting point for engineering-based improvements to Muon and related algorithms.

How to Cite

@misc{chen2026polarexpressviualized,
    author       = {Tyler Chen},
    title        = {The Polar Express for Muon, Visualized},
    year         = {2026},
    url          = {https://research.chen.pw/PolarExpress}
}
Footnotes
  1. Technically speaking, what we are calling the polar decomposition is actually the unitary factor of the polar decomposition. We call it the polar decomposition for simplicity.

References
  1. Jordan, K., Jin, Y., Boza, V., You, J., Cesista, F., Newhouse, L., & Bernstein, J. (2024). Muon: An optimizer for hidden layers in neural networks. https://kellerjordan.github.io/posts/muon/
  2. Amsel, N., Persson, D., Musco, C., & Gower, R. M. (2026). The Polar Express: Optimal Matrix Sign Methods and their Application to the Muon Algorithm. In The Fourteenth International Conference on Learning Representations. https://arxiv.org/abs/2505.16932
  3. Cesista, F. L., Jiacheng, Y., & Jordan, K. (2025). Squeezing 1-2% Efficiency Gains Out of Muon by Optimizing the Newton-Schulz Coefficients. https://leloykun.github.io/ponder/muon-opt-coeffs/