Passengers on the London Underground network#

The London Underground, nicknamed “The Tube”, transports millions of people each day. Which factors might affect passenger numbers at different stations? Here, we use the transport network to build a sparse Gaussian process model and fit it to daily passenger numbers. We first load the prepared data, including features such as the transport zone, number of interchanges, and location, for each station. Detailes on the data preparation can be found here.

We next compile the model and draw posterior samples. The model is shown below.

functions {
    #include gptools/util.stan
    #include gptools/graph.stan
}

data {
    int num_stations, num_edges, num_zones, num_degrees;
    array [num_stations] vector[2] station_locations;
    array [num_stations] int passengers;
    array [2, num_edges] int edge_index;
    matrix[num_stations, num_zones] one_hot_zones;
    matrix[num_stations, num_degrees] one_hot_degrees;
    int include_zone_effect, include_degree_effect;
}

parameters {
    vector[num_stations] z;
    real loc;
    real<lower=0> sigma, kappa;
    real<lower=log(0.32), upper=log(31)> log_length_scale;
    vector[num_zones] zone_effect;
    vector[num_degrees] degree_effect;
}

transformed parameters {
    real length_scale = exp(log_length_scale);
    vector[num_stations] f = gp_inv_graph_exp_quad_cov(
        z, zeros_vector(num_stations), station_locations, sigma, length_scale, edge_index);
    vector[num_stations] log_mean = loc + f
        + include_zone_effect * one_hot_zones * zone_effect
        + include_degree_effect * one_hot_degrees * degree_effect;
}

model {
    z ~ std_normal();
    sigma ~ student_t(2, 0, 1);
    zone_effect ~ student_t(2, 0, 1);
    degree_effect ~ student_t(2, 0, 1);
    kappa ~ student_t(2, 0, 1);
    for (i in 1:num_stations) {
        if (passengers[i] >= 0) {
            log(passengers[i]) ~ normal(log_mean[i], kappa);
        }
    }
}

from gptools.stan import compile_model
import json
import numpy as np
import os


with open("../../data/tube-stan.json") as fp:
    data = json.load(fp)

# Sample a training mask and update the data for Stan.
seed = 0
train_frac = 0.8
np.random.seed(seed)
train_mask = np.random.uniform(0, 1, data["num_stations"]) < train_frac

# Apply the training mask and include degree and zone effects.
y = np.asarray(data["passengers"])
data.update({
    "include_zone_effect": 1,
    "include_degree_effect": 1,
    # We use -1 for held-out data.
    "passengers": np.where(train_mask, y, -1),
})


if "CI" in os.environ:
    niter = 10
elif "READTHEDOCS" in os.environ:
    niter = 200
else:
    niter = None
model_with_gp = compile_model(stan_file="tube.stan")
chains = 1 if "READTHEDOCS" in os.environ or "CI" in os.environ else 4
fit_with_gp = model_with_gp.sample(data, chains=chains, iter_warmup=niter, iter_sampling=niter,
                                   seed=seed, adapt_delta=0.9, show_progress=False)
print(fit_with_gp.diagnose())

WARNING:root:cmdstan<2.36.0 had a bug in the evaluation of Matern 3/2 kernels with different length scales for different dimensions; see https://github.com/stan-dev/math/pull/3084 for details. Your model may yield unexpected results or crash if you use nearest-neighbor Gaussian processes with Matern 3/2 kernels. Your cmdstan version is 2.31; consider upgrading.

WARNING:cmdstanpy:CmdStanModel(compile=...) is deprecated and will be removed in the next major version. The constructor will always ensure a model has a compiled executable.
If you wish to force recompilation, use force_compile=True instead.

INFO:cmdstanpy:compiling stan file /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan to exe file /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube

DEBUG:cmdstanpy:cmd: make STANCFLAGS+=--filename-in-msg=tube.stan STANCFLAGS+=--include-paths=/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan,/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube
cwd: /home/docs/.cmdstan/cmdstan-2.31.0

DEBUG:cmdstanpy:Console output:

--- Translating Stan model to C++ code ---
bin/stanc --filename-in-msg=tube.stan --include-paths=/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan,/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube --o=/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.hpp /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan

--- Compiling, linking C++ code ---
g++ -std=c++1y -pthread -D_REENTRANT -Wno-sign-compare -Wno-ignored-attributes      -I stan/lib/stan_math/lib/tbb_2020.3/include    -O3 -I src -I stan/src -I stan/lib/rapidjson_1.1.0/ -I lib/CLI11-1.9.1/ -I stan/lib/stan_math/ -I stan/lib/stan_math/lib/eigen_3.3.9 -I stan/lib/stan_math/lib/boost_1.78.0 -I stan/lib/stan_math/lib/sundials_6.1.1/include -I stan/lib/stan_math/lib/sundials_6.1.1/src/sundials    -DBOOST_DISABLE_ASSERTS          -c -Wno-ignored-attributes   -x c++ -o /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.o /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.hpp
g++ -std=c++1y -pthread -D_REENTRANT -Wno-sign-compare -Wno-ignored-attributes      -I stan/lib/stan_math/lib/tbb_2020.3/include    -O3 -I src -I stan/src -I stan/lib/rapidjson_1.1.0/ -I lib/CLI11-1.9.1/ -I stan/lib/stan_math/ -I stan/lib/stan_math/lib/eigen_3.3.9 -I stan/lib/stan_math/lib/boost_1.78.0 -I stan/lib/stan_math/lib/sundials_6.1.1/include -I stan/lib/stan_math/lib/sundials_6.1.1/src/sundials    -DBOOST_DISABLE_ASSERTS                -Wl,-L,"/home/docs/.cmdstan/cmdstan-2.31.0/stan/lib/stan_math/lib/tbb" -Wl,-rpath,"/home/docs/.cmdstan/cmdstan-2.31.0/stan/lib/stan_math/lib/tbb"      /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.o src/cmdstan/main.o        -Wl,-L,"/home/docs/.cmdstan/cmdstan-2.31.0/stan/lib/stan_math/lib/tbb" -Wl,-rpath,"/home/docs/.cmdstan/cmdstan-2.31.0/stan/lib/stan_math/lib/tbb"   stan/lib/stan_math/lib/sundials_6.1.1/lib/libsundials_nvecserial.a stan/lib/stan_math/lib/sundials_6.1.1/lib/libsundials_cvodes.a stan/lib/stan_math/lib/sundials_6.1.1/lib/libsundials_idas.a stan/lib/stan_math/lib/sundials_6.1.1/lib/libsundials_kinsol.a  stan/lib/stan_math/lib/tbb/libtbb.so.2 -o /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube
rm -f /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.o

INFO:cmdstanpy:compiled model executable: /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube

DEBUG:cmdstanpy:cmd: /home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube info
cwd: None

DEBUG:cmdstanpy:input tempfile: /tmp/tmp4ip2ibnz/qsh0uka2.json

INFO:cmdstanpy:CmdStan start processing

DEBUG:cmdstanpy:idx 0

DEBUG:cmdstanpy:running CmdStan, num_threads: 1

DEBUG:cmdstanpy:CmdStan args: ['/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube', 'id=1', 'random', 'seed=0', 'data', 'file=/tmp/tmp4ip2ibnz/qsh0uka2.json', 'output', 'file=/tmp/tmp4ip2ibnz/tubelv_pebcs/tube-20250317165309.csv', 'method=sample', 'num_samples=200', 'num_warmup=200', 'algorithm=hmc', 'adapt', 'engaged=1', 'delta=0.9']

INFO:cmdstanpy:Chain [1] start processing

INFO:cmdstanpy:Chain [1] done processing

DEBUG:cmdstanpy:runset
RunSet: chains=1, chain_ids=[1], num_processes=1
 cmd (chain 1):
	['/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube', 'id=1', 'random', 'seed=0', 'data', 'file=/tmp/tmp4ip2ibnz/qsh0uka2.json', 'output', 'file=/tmp/tmp4ip2ibnz/tubelv_pebcs/tube-20250317165309.csv', 'method=sample', 'num_samples=200', 'num_warmup=200', 'algorithm=hmc', 'adapt', 'engaged=1', 'delta=0.9']
 retcodes=[0]
 per-chain output files (showing chain 1 only):
 csv_file:
	/tmp/tmp4ip2ibnz/tubelv_pebcs/tube-20250317165309.csv
 console_msgs (if any):
	/tmp/tmp4ip2ibnz/tubelv_pebcs/tube-20250317165309_0-stdout.txt

DEBUG:cmdstanpy:Chain 1 console:
method = sample (Default)
  sample
    num_samples = 200
    num_warmup = 200
    save_warmup = 0 (Default)
    thin = 1 (Default)
    adapt
      engaged = 1 (Default)
      gamma = 0.050000000000000003 (Default)
      delta = 0.90000000000000002
      kappa = 0.75 (Default)
      t0 = 10 (Default)
      init_buffer = 75 (Default)
      term_buffer = 50 (Default)
      window = 25 (Default)
    algorithm = hmc (Default)
      hmc
        engine = nuts (Default)
          nuts
            max_depth = 10 (Default)
        metric = diag_e (Default)
        metric_file =  (Default)
        stepsize = 1 (Default)
        stepsize_jitter = 0 (Default)
    num_chains = 1 (Default)
id = 1 (Default)
data
  file = /tmp/tmp4ip2ibnz/qsh0uka2.json
init = 2 (Default)
random
  seed = 0
output
  file = /tmp/tmp4ip2ibnz/tubelv_pebcs/tube-20250317165309.csv
  diagnostic_file =  (Default)
  refresh = 100 (Default)
  sig_figs = -1 (Default)
  profile_file = profile.csv (Default)
num_threads = 1 (Default)


Gradient evaluation took 0.000682 seconds
1000 transitions using 10 leapfrog steps per transition would take 6.82 seconds.
Adjust your expectations accordingly!


Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:
Exception: Exception: Exception: Exception: Exception: gp_exp_quad_cov: magnitude is 0, but must be positive finite! (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 61, column 8, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 210, column 8, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 284, column 4, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 421, column 4, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 27, column 4 to line 28, column 91)
If this warning occurs sporadically, such as for highly constrained variable types like covariance matrices, then the sampler is fine,
but if this warning occurs often then your model may be either severely ill-conditioned or misspecified.

Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:
Exception: Exception: Exception: Exception: Exception: gp_exp_quad_cov: magnitude is 0, but must be positive finite! (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 61, column 8, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 210, column 8, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 284, column 4, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 421, column 4, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 27, column 4 to line 28, column 91)
If this warning occurs sporadically, such as for highly constrained variable types like covariance matrices, then the sampler is fine,
but if this warning occurs often then your model may be either severely ill-conditioned or misspecified.

Iteration:   1 / 400 [  0%]  (Warmup)
Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:
Exception: Exception: Exception: Exception: Exception: gp_exp_quad_cov: magnitude is 0, but must be positive finite! (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 61, column 8, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 210, column 8, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 284, column 4, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 421, column 4, included from
'/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 3, column 4) (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/docs/tube/tube.stan', line 27, column 4 to line 28, column 91)
If this warning occurs sporadically, such as for highly constrained variable types like covariance matrices, then the sampler is fine,
but if this warning occurs often then your model may be either severely ill-conditioned or misspecified.

Iteration: 100 / 400 [ 25%]  (Warmup)
Iteration: 200 / 400 [ 50%]  (Warmup)
Iteration: 201 / 400 [ 50%]  (Sampling)
Iteration: 300 / 400 [ 75%]  (Sampling)
Iteration: 400 / 400 [100%]  (Sampling)

 Elapsed Time: 12.804 seconds (Warm-up)
               8.847 seconds (Sampling)
               21.651 seconds (Total)

WARNING:cmdstanpy:Non-fatal error during sampling:
Exception: Exception: Exception: Exception: Exception: gp_exp_quad_cov: magnitude is 0, but must be positive finite! (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 61, column 8, included from
	Exception: Exception: Exception: Exception: Exception: gp_exp_quad_cov: magnitude is 0, but must be positive finite! (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 61, column 8, included from
	Exception: Exception: Exception: Exception: Exception: gp_exp_quad_cov: magnitude is 0, but must be positive finite! (in '/home/docs/checkouts/readthedocs.org/user_builds/gptools-stan/checkouts/latest/python/stan/gptools/stan/gptools/graph.stan', line 61, column 8, included from
Consider re-running with show_console=True if the above output is unclear!

DEBUG:cmdstanpy:cmd: /home/docs/.cmdstan/cmdstan-2.31.0/bin/diagnose /tmp/tmp4ip2ibnz/tubelv_pebcs/tube-20250317165309.csv
cwd: None

Processing csv files: /tmp/tmp4ip2ibnz/tubelv_pebcs/tube-20250317165309.csv

Checking sampler transitions treedepth.
Treedepth satisfactory for all transitions.

Checking sampler transitions for divergences.
No divergent transitions found.

Checking E-BFMI - sampler transitions HMC potential energy.
E-BFMI satisfactory.

Effective sample size satisfactory.

Split R-hat values satisfactory all parameters.

Processing complete, no problems detected.

We construct a figure that shows the data, effects of zone and degree, and the residual effects captured by the Gaussian process.

from pathlib import Path

fig, axes = plt.subplots(2, 2, gridspec_kw={"width_ratios": [2, 1]}, figsize=(6, 6))
ax1, ax2 = axes[:, 0]
kwargs = {"marker": "o", "s": 10}


X = np.asarray(data["station_locations"])
ax1.scatter(*X[~train_mask].T, facecolor="w", edgecolor="gray", **kwargs)
pts1 = ax1.scatter(*X[train_mask].T, c=y[train_mask], norm=mpl.colors.LogNorm(vmin=np.min(y)),
                   **kwargs)

c = fit_with_gp.stan_variable("f").mean(axis=0)
vmax = np.abs(c).max()
pts2 = ax2.scatter(*X.T, c=c, vmin=-vmax, vmax=vmax,
                   **kwargs, cmap="coolwarm")
ax1.set_aspect("equal")
ax2.set_aspect("equal")

ax2.annotate("Canary Wharf", X[np.argmax(c)], (20, -12), ha="center",
             va="center", fontsize="small",
             arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=-0.5"})
ax2.annotate("Hainault\nLoop", X[np.argmin(c)], (31, 13), ha="right",
             va="center", fontsize="small",
             arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.2",
                         "patchA": None, "shrinkA": 10})

ax1.set_axis_off()
ax2.set_axis_off()
location = "top"
fraction = 0.05
cb1 = fig.colorbar(pts1, ax=ax1, label="entries & exits", location=location, fraction=fraction)
cb2 = fig.colorbar(pts2, ax=ax2, label="Gaussian process effect", location=location, fraction=fraction)

for ax in [ax1, ax2]:
    segments = []
    for u, v in np.transpose(data["edge_index"]):
        segments.append([X[u - 1], X[v - 1]])
    collection = mpl.collections.LineCollection(segments, zorder=0, color="silver")
    ax.add_collection(collection)

ax1.set_ylabel("northing (km)")
ax1.set_xlabel("easting (km)")
ax2.set_ylabel("northing (km)")
ax2.set_xlabel("easting (km)")
ax3, ax4 = axes[:, 1]

effect = fit_with_gp.stan_variable("degree_effect")
effect -= effect.mean(axis=1, keepdims=True)
line, *_ = ax3.errorbar(np.arange(effect.shape[1]) + 1, effect.mean(axis=0), effect.std(axis=0),
                        marker="o")
line.set_markeredgecolor("w")
ax3.set_ylabel("degree effect")
ax3.set_xlabel("degree")
ax3.set_xticks([1, 3, data["num_degrees"]])
ax3.set_xticklabels(["1", "3", f"{data['num_degrees']}+"])
ax3.axhline(0, color="k", ls=":")


effect = fit_with_gp.stan_variable("zone_effect")
effect -= effect.mean(axis=1, keepdims=True)
line, *_ = ax4.errorbar(np.arange(effect.shape[1]) + 1, effect.mean(axis=0), effect.std(axis=0),
                        marker="o")
line.set_markeredgecolor("w")
ax4.set_ylabel("zone effect")
ax4.set_xlabel("zone")
ax4.set_xticks([2, 4, data["num_zones"]])
ax4.set_xticklabels(["2", "4", f"{data['num_zones']}+"])
ax4.axhline(0, color="k", ls=":")

ax1.text(0.025, 0.05, "(a)", transform=ax1.transAxes)
ax2.text(0.025, 0.05, "(c)", transform=ax2.transAxes)
ax3.text(0.05, 0.95, "(b)", transform=ax3.transAxes, va="top")
ax4.text(0.95, 0.95, "(d)", transform=ax4.transAxes, va="top", ha="right")


fig.tight_layout()

workspace = Path(os.environ.get("WORKSPACE", os.getcwd()))
fig.savefig(workspace / "tube.pdf", bbox_inches="tight")
fig.savefig(workspace / "tube.png", bbox_inches="tight")

../../_images/b623eaa92cd7bc9e86cad7bfb8cef64ff506b664115ac82b65ad0b6820ddbf4f.png

On the one hand, the three northern stations of the Hainault Loop (Roding Valley, Chigwell, and Grange Hill) are underused because they are serviced by only three trains an hour whereas nearby stations (such as Hainault, Woodford, and Buckhurst Hill) are serviced by twelve trains an hour. On the other hand, Canary Wharf at the heart of the financial district has much higher use than would be expected for a station that only serves a single line in zone 2.