Benchmark SamplesLoss in 3D

Let’s compare the performance of our losses and backends as the number of samples grows from 100 to 1,000,000.

Setup

import numpy as np
import time
from matplotlib import pyplot as plt

import importlib
import torch

use_cuda = torch.cuda.is_available()

from geomloss import SamplesLoss

MAXTIME = 10 if use_cuda else 1  # Max number of seconds before we break the loop
REDTIME = (
    2 if use_cuda else 0.2
)  # Decrease the number of runs if computations take longer than 2s...
D = 3  # Let's do this in 3D

# Number of samples that we'll loop upon
NS = [
    100,
    200,
    500,
    1000,
    2000,
    5000,
    10000,
    20000,
    50000,
    100000,
    200000,
    500000,
    1000000,
]

Synthetic dataset. Feel free to use a Stanford Bunny, or whatever!

def generate_samples(N, device):
    """Create point clouds sampled non-uniformly on a sphere of diameter 1."""

    x = torch.randn(N, D, device=device)
    x[:, 0] += 1
    x = x / (2 * x.norm(dim=1, keepdim=True))

    y = torch.randn(N, D, device=device)
    y[:, 1] += 2
    y = y / (2 * y.norm(dim=1, keepdim=True))

    x.requires_grad = True

    # Draw random weights:
    a = torch.randn(N, device=device)
    b = torch.randn(N, device=device)

    # And normalize them:
    a = a.abs()
    b = b.abs()
    a = a / a.sum()
    b = b / b.sum()

    return a, x, b, y
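
A quick sanity check of the generated measures (a minimal sketch, for illustration only; not part of the original benchmark):

# Illustration: shapes and weight normalization of one synthetic problem.
a, x, b, y = generate_samples(1000, "cuda" if use_cuda else "cpu")
print(x.shape, y.shape)                # both point clouds are N-by-D
print(a.sum().item(), b.sum().item())  # both weight vectors sum to 1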

Benchmarking loops.

def benchmark(Loss, dev, N, loops=10):
    """Times a loss computation+gradient on an N-by-N problem."""

    importlib.reload(torch)  # In case we had a memory overflow just before...
    device = torch.device(dev)
    a, x, b, y = generate_samples(N, device)

    # We simply benchmark a Loss + gradient wrt. x
    code = "L = Loss( a, x, b, y ) ; L.backward()"
    Loss.verbose = True
    exec(code, locals())  # Warmup run, to compile and load everything
    Loss.verbose = False

    t_0 = time.perf_counter()  # Actual benchmark --------------------
    if use_cuda:
        torch.cuda.synchronize()
    for i in range(loops):
        exec(code, locals())
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - t_0  # ---------------------------

    print(
        "{:3} NxN loss, with N ={:7}: {:3}x{:3.6f}s".format(
            loops, N, loops, elapsed / loops
        )
    )
    return elapsed / loops


def bench_config(Loss, dev):
    """Times a loss computation+gradient for an increasing number of samples."""

    print("Backend : {}, Device : {} -------------".format(Loss.backend, dev))

    times = []

    def run_bench():
        try:
            Nloops = [100, 10, 1]
            nloops = Nloops.pop(0)
            for n in NS:
                elapsed = benchmark(Loss, dev, n, loops=nloops)

                times.append(elapsed)
                if (nloops * elapsed > MAXTIME) or (
                    nloops * elapsed > REDTIME and len(Nloops) > 0
                ):
                    nloops = Nloops.pop(0)

        except IndexError:
            print("**\nToo slow !")

    try:
        run_bench()

    except RuntimeError as err:
        if str(err)[:4] == "CUDA":
            print("**\nMemory overflow !")

        else:
            # CUDA memory overflows semi-break the internal
            # torch state and may cause some strange bugs.
            # In this case, best option is simply to re-launch
            # the benchmark.
            run_bench()

    return times + (len(NS) - len(times)) * [np.nan]


def full_bench(loss, *args, **kwargs):
    """Benchmarks the varied backends of a geometric loss function."""

    print("Benchmarking : ===============================")

    lines = [NS]
    backends = ["tensorized", "online", "multiscale"]
    for backend in backends:
        Loss = loss(*args, **kwargs, backend=backend)  # use the constructor passed as the first argument
        lines.append(bench_config(Loss, "cuda" if use_cuda else "cpu"))

    benches = np.array(lines).T

    # Creates a pyplot figure:
    plt.figure()
    linestyles = ["o-", "s-", "^-"]
    for i, backend in enumerate(backends):
        plt.plot(
            benches[:, 0],
            benches[:, i + 1],
            linestyles[i],
            linewidth=2,
            label='backend="{}"'.format(backend),
        )

    plt.title('Runtime for SamplesLoss("{}") in dimension {}'.format(Loss.loss, D))
    plt.xlabel("Number of samples per measure")
    plt.ylabel("Seconds")
    plt.yscale("log")
    plt.xscale("log")
    plt.legend(loc="upper left")
    plt.grid(True, which="major", linestyle="-")
    plt.grid(True, which="minor", linestyle="dotted")
    plt.axis([NS[0], NS[-1], 1e-3, MAXTIME])
    plt.tight_layout()

    # Save as a .csv to put a nice Tikz figure in the papers:
    header = "Npoints " + " ".join(backends)
    np.savetxt(
        "output/benchmark_" + Loss.loss + "_3D.csv",
        benches,
        fmt="%-9.5f",
        header=header,
        comments="",
    )
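
Once a benchmark has run, the saved CSV can be reloaded for custom plots. A minimal sketch, assuming the output/ folder exists and the Gaussian benchmark below has been executed:

# Illustration: reload the timings with their column names (Npoints, tensorized, online, multiscale).
bench = np.genfromtxt("output/benchmark_gaussian_3D.csv", names=True)
print(bench["Npoints"], bench["online"])  # sample sizes and per-call runtimes, in seconds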

Gaussian MMD, with a small blur
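
Under the hood, full_bench instantiates one SamplesLoss per backend and times the call defined in benchmark. As a minimal standalone sketch (the "online" backend and N = 10,000 are picked arbitrarily here):

# Illustration: one loss evaluation + gradient, outside of the timing loop.
loss = SamplesLoss("gaussian", blur=0.1, truncate=3, backend="online")
a, x, b, y = generate_samples(10000, "cuda" if use_cuda else "cpu")
L = loss(a, x, b, y)  # scalar value of the Gaussian MMD
L.backward()          # gradient wrt. x, since x.requires_grad is set in generate_samples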

full_bench(SamplesLoss, "gaussian", blur=0.1, truncate=3)
[Figure: Runtime for SamplesLoss("gaussian") in dimension 3]
Benchmarking : ===============================
Backend : tensorized, Device : cuda -------------
100 NxN loss, with N =    100: 100x0.001295s
100 NxN loss, with N =    200: 100x0.001298s
100 NxN loss, with N =    500: 100x0.001305s
100 NxN loss, with N =   1000: 100x0.001305s
100 NxN loss, with N =   2000: 100x0.001300s
100 NxN loss, with N =   5000: 100x0.004568s
100 NxN loss, with N =  10000: 100x0.017096s
100 NxN loss, with N =  20000: 100x0.067320s
 10 NxN loss, with N =  50000:  10x0.457256s
**
Memory overflow !
Backend : online, Device : cuda -------------
100 NxN loss, with N =    100: 100x0.002317s
100 NxN loss, with N =    200: 100x0.002348s
100 NxN loss, with N =    500: 100x0.002417s
100 NxN loss, with N =   1000: 100x0.002549s
100 NxN loss, with N =   2000: 100x0.002803s
100 NxN loss, with N =   5000: 100x0.003551s
100 NxN loss, with N =  10000: 100x0.004757s
100 NxN loss, with N =  20000: 100x0.007014s
100 NxN loss, with N =  50000: 100x0.027529s
 10 NxN loss, with N = 100000:  10x0.081861s
 10 NxN loss, with N = 200000:  10x0.308800s
  1 NxN loss, with N = 500000:   1x1.879292s
  1 NxN loss, with N =1000000:   1x7.315116s
Backend : multiscale, Device : cuda -------------
86x72 clusters, computed at scale = 0.779
100 NxN loss, with N =    100: 100x0.078562s
147x114 clusters, computed at scale = 0.782
 10 NxN loss, with N =    200:  10x0.120746s
283x187 clusters, computed at scale = 0.785
 10 NxN loss, with N =    500:  10x0.205427s
378x231 clusters, computed at scale = 0.791
  1 NxN loss, with N =   1000:   1x0.261250s
486x282 clusters, computed at scale = 0.793
  1 NxN loss, with N =   2000:   1x0.324979s
573x388 clusters, computed at scale = 0.793
  1 NxN loss, with N =   5000:   1x0.229984s
644x452 clusters, computed at scale = 0.794
  1 NxN loss, with N =  10000:   1x0.264931s
667x523 clusters, computed at scale = 0.794
  1 NxN loss, with N =  20000:   1x0.141090s
694x610 clusters, computed at scale = 0.794
  1 NxN loss, with N =  50000:   1x0.177706s
703x654 clusters, computed at scale = 0.794
  1 NxN loss, with N = 100000:   1x0.220860s
709x680 clusters, computed at scale = 0.794
  1 NxN loss, with N = 200000:   1x0.368317s
718x702 clusters, computed at scale = 0.794
  1 NxN loss, with N = 500000:   1x1.237671s
723x704 clusters, computed at scale = 0.794
  1 NxN loss, with N =1000000:   1x4.152498s

Energy Distance MMD

full_bench(SamplesLoss, "energy")
[Figure: Runtime for SamplesLoss("energy") in dimension 3]
Benchmarking : ===============================
Backend : tensorized, Device : cuda -------------
100 NxN loss, with N =    100: 100x0.001298s
100 NxN loss, with N =    200: 100x0.001294s
100 NxN loss, with N =    500: 100x0.001302s
100 NxN loss, with N =   1000: 100x0.001303s
100 NxN loss, with N =   2000: 100x0.001298s
100 NxN loss, with N =   5000: 100x0.005088s
100 NxN loss, with N =  10000: 100x0.019103s
100 NxN loss, with N =  20000: 100x0.075266s
 10 NxN loss, with N =  50000:  10x0.514765s
**
Memory overflow !
Backend : online, Device : cuda -------------
100 NxN loss, with N =    100: 100x0.002094s
100 NxN loss, with N =    200: 100x0.002120s
100 NxN loss, with N =    500: 100x0.002211s
100 NxN loss, with N =   1000: 100x0.002348s
100 NxN loss, with N =   2000: 100x0.002556s
100 NxN loss, with N =   5000: 100x0.003236s
100 NxN loss, with N =  10000: 100x0.004323s
100 NxN loss, with N =  20000: 100x0.006354s
100 NxN loss, with N =  50000: 100x0.024542s
 10 NxN loss, with N = 100000:  10x0.073084s
 10 NxN loss, with N = 200000:  10x0.275950s
  1 NxN loss, with N = 500000:   1x1.677447s
  1 NxN loss, with N =1000000:   1x6.527392s
Backend : multiscale, Device : cuda -------------
100 NxN loss, with N =    100: 100x0.002062s
100 NxN loss, with N =    200: 100x0.002101s
100 NxN loss, with N =    500: 100x0.002183s
100 NxN loss, with N =   1000: 100x0.002290s
100 NxN loss, with N =   2000: 100x0.002509s
100 NxN loss, with N =   5000: 100x0.003198s
100 NxN loss, with N =  10000: 100x0.004272s
100 NxN loss, with N =  20000: 100x0.006326s
100 NxN loss, with N =  50000: 100x0.024524s
 10 NxN loss, with N = 100000:  10x0.073177s
 10 NxN loss, with N = 200000:  10x0.275970s
  1 NxN loss, with N = 500000:   1x1.677247s
  1 NxN loss, with N =1000000:   1x6.526809s

Sinkhorn divergence
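
As above, the benchmark times one Sinkhorn divergence evaluation plus its gradient. A minimal standalone sketch with the parameters of the first run below (the "multiscale" backend is picked arbitrarily here):

# Illustration: one Sinkhorn divergence evaluation + gradient.
loss = SamplesLoss("sinkhorn", p=2, blur=0.05, diameter=1, backend="multiscale")
a, x, b, y = generate_samples(100000, "cuda" if use_cuda else "cpu")
L = loss(a, x, b, y)
L.backward()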

With a medium blurring scale, at one twentieth of the configuration’s diameter:

full_bench(SamplesLoss, "sinkhorn", p=2, blur=0.05, diameter=1)
[Figure: Runtime for SamplesLoss("sinkhorn") in dimension 3, blur = 0.05]
Benchmarking : ===============================
Backend : tensorized, Device : cuda -------------
100 NxN loss, with N =    100: 100x0.005081s
100 NxN loss, with N =    200: 100x0.005107s
100 NxN loss, with N =    500: 100x0.005111s
100 NxN loss, with N =   1000: 100x0.005114s
100 NxN loss, with N =   2000: 100x0.005262s
100 NxN loss, with N =   5000: 100x0.028884s
 10 NxN loss, with N =  10000:  10x0.115623s
 10 NxN loss, with N =  20000:  10x0.456373s
  1 NxN loss, with N =  50000:   1x3.718372s
**
Memory overflow !
Backend : online, Device : cuda -------------
100 NxN loss, with N =    100: 100x0.009242s
100 NxN loss, with N =    200: 100x0.009537s
100 NxN loss, with N =    500: 100x0.010473s
100 NxN loss, with N =   1000: 100x0.011851s
100 NxN loss, with N =   2000: 100x0.013679s
100 NxN loss, with N =   5000: 100x0.020111s
 10 NxN loss, with N =  10000:  10x0.031053s
 10 NxN loss, with N =  20000:  10x0.052745s
 10 NxN loss, with N =  50000:  10x0.241384s
  1 NxN loss, with N = 100000:   1x0.745304s
  1 NxN loss, with N = 200000:   1x2.847517s
  1 NxN loss, with N = 500000:   1x17.308899s
**
Too slow !
Backend : multiscale, Device : cuda -------------
95x83 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N =    100: 100x0.011557s
174x158 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N =    200: 100x0.011937s
385x283 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N =    500: 100x0.012561s
645x433 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N =   1000: 100x0.013104s
911x594 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N =   2000: 100x0.013720s
1332x834 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N =   5000: 100x0.014843s
1622x1046 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N =  10000: 100x0.015395s
1824x1256 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N =  20000: 100x0.015607s
2009x1557 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N =  50000: 100x0.016825s
2075x1755 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N = 100000: 100x0.018993s
2128x1905 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
100 NxN loss, with N = 200000: 100x0.024494s
2164x2064 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
 10 NxN loss, with N = 500000:  10x0.043355s
2189x2112 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.050
Extrapolate from coarse to fine after the last iteration.
 10 NxN loss, with N =1000000:  10x0.081246s

With a small blurring scale, at one hundredth of the configuration’s diameter:

full_bench(SamplesLoss, "sinkhorn", p=2, blur=0.01, diameter=1)

plt.show()
[Figure: Runtime for SamplesLoss("sinkhorn") in dimension 3, blur = 0.01]
Benchmarking : ===============================
Backend : tensorized, Device : cuda -------------
100 NxN loss, with N =    100: 100x0.006009s
100 NxN loss, with N =    200: 100x0.006025s
100 NxN loss, with N =    500: 100x0.006027s
100 NxN loss, with N =   1000: 100x0.006025s
100 NxN loss, with N =   2000: 100x0.006253s
100 NxN loss, with N =   5000: 100x0.034260s
 10 NxN loss, with N =  10000:  10x0.137050s
 10 NxN loss, with N =  20000:  10x0.540808s
  1 NxN loss, with N =  50000:   1x4.203203s
**
Memory overflow !
Backend : online, Device : cuda -------------
100 NxN loss, with N =    100: 100x0.011050s
100 NxN loss, with N =    200: 100x0.011498s
100 NxN loss, with N =    500: 100x0.012587s
100 NxN loss, with N =   1000: 100x0.014234s
100 NxN loss, with N =   2000: 100x0.016382s
100 NxN loss, with N =   5000: 100x0.024119s
 10 NxN loss, with N =  10000:  10x0.037300s
 10 NxN loss, with N =  20000:  10x0.063547s
 10 NxN loss, with N =  50000:  10x0.291845s
  1 NxN loss, with N = 100000:   1x0.904322s
  1 NxN loss, with N = 200000:   1x3.442805s
  1 NxN loss, with N = 500000:   1x20.932180s
**
Too slow !
Backend : multiscale, Device : cuda -------------
94x86 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 1085/8084 = 13.4% of the coarse cost matrix.
Keep 1088/8836 = 12.3% of the coarse cost matrix.
Keep 1644/7396 = 22.2% of the coarse cost matrix.
100 NxN loss, with N =    100: 100x0.222965s
178x161 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 4032/28658 = 14.1% of the coarse cost matrix.
Keep 4028/31684 = 12.7% of the coarse cost matrix.
Keep 5221/25921 = 20.1% of the coarse cost matrix.
 10 NxN loss, with N =    200:  10x0.387265s
384x293 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 14575/112512 = 13.0% of the coarse cost matrix.
Keep 16720/147456 = 11.3% of the coarse cost matrix.
Keep 16433/85849 = 19.1% of the coarse cost matrix.
  1 NxN loss, with N =    500:   1x0.739979s
628x435 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 33743/273180 = 12.4% of the coarse cost matrix.
Keep 40998/394384 = 10.4% of the coarse cost matrix.
Keep 32103/189225 = 17.0% of the coarse cost matrix.
  1 NxN loss, with N =   1000:   1x0.622462s
921x568 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 60156/523128 = 11.5% of the coarse cost matrix.
Keep 78019/848241 = 9.2% of the coarse cost matrix.
Keep 47842/322624 = 14.8% of the coarse cost matrix.
  1 NxN loss, with N =   2000:   1x0.409141s
1353x805 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 112129/1089165 = 10.3% of the coarse cost matrix.
Keep 151917/1830609 = 8.3% of the coarse cost matrix.
Keep 79905/648025 = 12.3% of the coarse cost matrix.
  1 NxN loss, with N =   5000:   1x0.239286s
1611x1031 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 161579/1660941 = 9.7% of the coarse cost matrix.
Keep 209627/2595321 = 8.1% of the coarse cost matrix.
Keep 117177/1062961 = 11.0% of the coarse cost matrix.
  1 NxN loss, with N =  10000:   1x0.066433s
1831x1271 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 214853/2327201 = 9.2% of the coarse cost matrix.
Keep 268309/3352561 = 8.0% of the coarse cost matrix.
Keep 159221/1615441 = 9.9% of the coarse cost matrix.
  1 NxN loss, with N =  20000:   1x0.076065s
2009x1567 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 279988/3148103 = 8.9% of the coarse cost matrix.
Keep 322921/4036081 = 8.0% of the coarse cost matrix.
Keep 225049/2455489 = 9.2% of the coarse cost matrix.
  1 NxN loss, with N =  50000:   1x0.110892s
2081x1721 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 311185/3581401 = 8.7% of the coarse cost matrix.
Keep 347463/4330561 = 8.0% of the coarse cost matrix.
Keep 264005/2961841 = 8.9% of the coarse cost matrix.
  1 NxN loss, with N = 100000:   1x0.191689s
2130x1938 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 348205/4127940 = 8.4% of the coarse cost matrix.
Keep 364516/4536900 = 8.0% of the coarse cost matrix.
Keep 330772/3755844 = 8.8% of the coarse cost matrix.
  1 NxN loss, with N = 200000:   1x0.413915s
2159x2044 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 366486/4412996 = 8.3% of the coarse cost matrix.
Keep 375159/4661281 = 8.0% of the coarse cost matrix.
Keep 369528/4177936 = 8.8% of the coarse cost matrix.
  1 NxN loss, with N = 500000:   1x1.667767s
2181x2115 clusters, computed at scale = 0.046
Successive scales :  1.000, 1.000, 0.500, 0.250, 0.125, 0.063, 0.031, 0.016, 0.010
Jump from coarse to fine between indices 5 (σ=0.063) and 6 (σ=0.031).
Keep 380011/4612815 = 8.2% of the coarse cost matrix.
Keep 383005/4756761 = 8.1% of the coarse cost matrix.
Keep 397623/4473225 = 8.9% of the coarse cost matrix.
  1 NxN loss, with N =1000000:   1x5.737766s

Total running time of the script: ( 5 minutes 57.460 seconds)
