-
-
Notifications
You must be signed in to change notification settings - Fork 189
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Feature] Add a bracketing rootfinder #2859
Comments
Here's an alternative implementation using today's functionality. This one uses the Newton solver with two caveats:
// content of file stan/slog_pdf_dqf_bi_mod.stan
functions{ // begin the functions block
// Skew-logistic quantile function (QF):
// Q(p | mu, sigma, delta) = mu + sigma*((1-delta)*log(p) - delta*log(1-p)).
// delta in (0,1) weights the two exponential components; log1m(p) = log(1-p).
real slogis_s_qf(real p, real mu, real sigma, real delta) {
if(is_nan(p)) reject("p can not be nan!");
return mu+sigma*((1-delta)*log(p)-delta*log1m(p));
}
// Quantile density function q(p) = dQ/dp.
// Positive on (0,1) whenever sigma > 0 and 0 < delta < 1.
real slogis_s_qdf(real p, real mu, real sigma, real delta) {
return sigma*((1-delta)/p+delta/(1-p));
}
// Log density quantile function: log(1/q(p)) = -log(q(p)).
// Serves as the per-observation log-likelihood contribution at depth p.
real slogis_s_ldqf_lpdf(real p, real mu, real sigma, real delta){
return -log(sigma)-log((1-delta)/p+delta/(1-p));
}
// This function is the algebra system with a signature:
// vector algebra_system (vector y, vector theta, data vector x)
// z0 is unconstrained; inv_logit maps it into (0,1) so the Newton solver
// can search the whole real line while the depth u stays in the unit interval.
vector slogis_iqf_as(vector z0, vector params, data real[] x_r){
real u=inv_logit(z0[1]);
return [x_r[1] - slogis_s_qf(u, params[1], params[2], params[3])]';
}
// this function calls a solver which operates the algebra system
// vector solve_newton(function algebra_system, vector y_guess, ...)
// Returns the unconstrained root z with Q(inv_logit(z)) = x,
// i.e. an approximation of logit(F(x)) for the skew-logistic CDF F.
real approx_slogis_cdf(data real x, real z_guess, real mu, real sigma, real delta, data real rel_tol, data real f_tol, data int max_steps){
return solve_newton_tol(slogis_iqf_as, [z_guess]', rel_tol, f_tol, max_steps, to_vector({mu, sigma, delta}), {x})[1];
}
} // end of block
data {
int<lower=0> N; // size of the data
vector[N] y; // data on variable level
// hyperparameters of the priors:
real h_mu_mu; // prior mean of mu
real h_sigma_mu; // prior sd of mu
real h_lambda_sigma; // rate of the exponential prior on sigma
real h_a_delta; // first shape of the beta prior on delta
real h_b_delta; // second shape of the beta prior on delta
// Newton solver controls, forwarded to solve_newton_tol:
real rel_tol; // relative tolerance on the solution
real f_tol; // function-value tolerance
int max_steps; // maximum number of solver iterations
}
transformed data {
// Fixed initial guess for every observation: depth 0.5, i.e. z = logit(0.5) = 0.
vector[N] u_guess = rep_vector(0.5, N);
vector[N] z_guess = logit(u_guess);
}
parameters {
real mu; // location for skew-logistic
real<lower=0> sigma; // scale for skew-logistic
real<lower=0,upper=1> delta; //skewness for skew-logistic
}
model {
vector[N] u;
vector[N] z;
// priors
target += normal_lpdf(mu | h_mu_mu, h_sigma_mu);
target += exponential_lpdf(sigma | h_lambda_sigma);
target += beta_lpdf(delta | h_a_delta, h_b_delta);
for (i in 1:N){
// numerically invert the QF: z[i] approximates logit(F(y[i]))
z[i] = approx_slogis_cdf(y[i], z_guess[i], mu, sigma, delta, rel_tol, f_tol, max_steps);
u[i] = inv_logit(z[i]);
// likelihood contribution via the log density quantile function
target += slogis_s_ldqf_lpdf(u[i] | mu, sigma, delta);
}
}
which is called by
library(cmdstanr)
library(magrittr)
slogis_as_mod <- cmdstanr::cmdstan_model("stan/slog_pdf_dqf_bi_mod.stan")
# Simulate a dataset of size N from the skew-logistic distribution (SLD)
# and assemble the data list expected by stan/slog_pdf_dqf_bi_mod.stan.
# Hyperparameters are exposed as arguments; the defaults reproduce the
# previous hard-coded values, so existing calls are unaffected.
# Returns a list with:
#   variables - true parameter values used to simulate (for recovery checks)
#   generated - the Stan data list
slogis_generator_single <- function(N,
                                    h_mu_mu = 0,
                                    h_sigma_mu = 1,
                                    h_lambda_sigma = 1,
                                    h_a_delta = 2,
                                    h_b_delta = 2) {
  # draw the "true" parameters from their priors
  true_mu <- rnorm(1, h_mu_mu, h_sigma_mu)
  true_sigma <- rexp(1, h_lambda_sigma)
  true_delta <- rbeta(1, h_a_delta, h_b_delta)
  # inverse-transform sampling: push uniform depths through the SLD QF
  u <- runif(N)
  y <- true_mu + true_sigma * ((1 - true_delta) * log(u) - true_delta * log1p(-u))
  list(
    variables = list(
      mu = true_mu,
      sigma = true_sigma,
      delta = true_delta
    ),
    generated = list(
      N = N, y = y,
      h_mu_mu = h_mu_mu,
      h_sigma_mu = h_sigma_mu,
      h_lambda_sigma = h_lambda_sigma,
      h_a_delta = h_a_delta,
      h_b_delta = h_b_delta, # FIX: was h_a_delta (copy-paste bug; latent because both defaults are 2)
      rel_tol = 1e-12, f_tol = 1e1, max_steps = 1e6
    )
  )
}
# simulate 100 observations and eyeball the draw
stan_data <- slogis_generator_single(100)
hist(stan_data$generated$y)
# random inits matching the priors' support
initfun <- function() list(mu=rnorm(1,0,1), sigma=rexp(1,1), delta=runif(1, 0, 1))
# fit the Newton-solver model
# NOTE(review): f_tol = 1e1 in the generated data is a very loose function
# tolerance for solve_newton_tol - confirm this is intentional
slogis_mod <- slogis_as_mod$sample(data=stan_data$generated,init = initfun,
iter_sampling = 1000, iter_warmup = 1000)
slogis_mod %>% posterior::as_draws_array() %>% bayesplot::mcmc_combo()
|
And here's a custom implementation of the bracketing rootfinding algorithm (Brent), with a couple of caveats:
// content of the file stan/slog_pdf_dqf_br_mod.stan
functions{ // begin the functions block
// Skew-logistic quantile function (QF):
// Q(p | mu, sigma, delta) = mu + sigma*((1-delta)*log(p) - delta*log(1-p)).
real slogis_s_qf(real p, real mu, real sigma, real delta) {
if(is_nan(p)) reject("p can not be nan!");
return mu+sigma*((1-delta)*log(p)-delta*log1m(p));
}
// Quantile density function q(p) = dQ/dp.
// Unused by this model's likelihood; kept for parity with the bi variant.
real slogis_s_qdf(real p, real mu, real sigma, real delta) {
return sigma*((1-delta)/p+delta/(1-p));
}
// Log density quantile function: log(1/q(p)), the per-observation
// log-likelihood contribution at depth p.
real slogis_s_ldqf_lpdf(real p, real mu, real sigma, real delta){
return -log(sigma)-log((1-delta)/p+delta/(1-p));
}
// Target function for the bracketing rootfinder:
// Omega(u) = y_r - Q(u | params), params = [mu, sigma, delta].
// Non-increasing in u because the QF is non-decreasing.
real rootfun(real u0, vector params, data real y_r){
return y_r - slogis_s_qf(u0, params[1], params[2], params[3]);
}
// Invert the skew-logistic QF on (0,1) with a Brent-style bracketing
// rootfinder: solves rootfun(u) = y_r - Q(u | params) = 0 for the depth u,
// searching the clipped interval [rel_tol, 1 - rel_tol].
//   params    - [mu, sigma, delta] of the skew-logistic QF
//   y_r       - observation to invert
//   rel_tol   - endpoint clipping AND the bracket-width stopping tolerance
//   max_steps - iteration cap (declared real; an int argument promotes cleanly)
//   verbose   - if nonzero, print the number of iterations used
// Returns the approximate depth u such that Q(u) ~= y_r.
real iqf_brent01(vector params, data real y_r, data real rel_tol, data real max_steps, int verbose){
real u0=rel_tol;
real u1=1-rel_tol;
int steps_taken=0;
real f0=rootfun(u0, params, y_r);
real f1=rootfun(u1, params, y_r);
real tmp=0;
// assuming rootfun is non-increasing
// if target function at both ends is positive, the true root is above
// if target function at both ends is negative, the true root is below
// (i.e. the root lies outside the clipped interval: clamp to the endpoint)
if(f1>0 && f0>0) return(1-rel_tol);
if(f1<0 && f0<0) return(rel_tol);
if(is_inf(f0) || is_inf(f1)) reject("Tolerance is too small!");
// swap for non-decreasing function
// keep |f(u1)| <= |f(u0)| so u1 is the better current estimate
if (abs(f0)<abs(f1)){
tmp=u0;
u0=u1;
u1=tmp;
tmp=f0;
f0=f1;
f1=tmp;
}
// u2 is the previous value of u1; d is the one before that (Brent's bookkeeping)
real u2=u0;
real f2=f0;
int mflag=1; // 1 when the last accepted step was a bisection
real n=0; // candidate for the next iterate
real d=0;
// NOTE(review): each iteration re-evaluates rootfun at u0, u1 AND u2 even
// though only one endpoint changed; caching f-values would cut ~2/3 of the
// QF evaluations per iteration.
while(steps_taken<max_steps && abs(u1-u0)>rel_tol){
f0=rootfun(u0, params, y_r);
f1=rootfun(u1, params, y_r);
f2=rootfun(u2, params, y_r);
// inverse quadratic interpolation when all three f-values are distinct,
// otherwise fall back to the secant step
if(f0!=f2 && f1 !=f2){
real l0=(u0*f1*f2)/((f0-f1)*(f0-f2));
real l1=(u1*f0*f2)/((f1-f0)*(f1-f2));
real l2=(u2*f1*f0)/((f2-f0)*(f2-f1));
n = l0+l1+l2;
} else {
n = u1-(f1*(u1-u0)/(f1-f0));
}
// accept the interpolated step only if it is inside the trusted region and
// shrinking fast enough; otherwise bisect.
// NOTE(review): the test n < (3*u0+u1)/4 || n > u1 assumes u0 < u1, but the
// |f|-based swaps can leave the bracket ordered either way - confirm against
// the canonical Brent condition, which uses the interval between (3a+b)/4 and b.
if((n<(3*u0+u1)/4 || n>u1) ||
(mflag==1 && abs(n-u1)>=abs(u1-u2)/2.0) ||
(mflag==0 && abs(n-u1)>=abs(u2-d)/2.0) ||
(mflag==1 && abs(u1-u2)<rel_tol ) ||
(mflag==0 && abs(u2-d)<rel_tol)
){
n=(u0+u1)/2.0;
mflag=1;
} else {
mflag=0;
}
real fn=rootfun(n, params, y_r);
// rotate the history and shrink the bracket around the sign change
d=u2;
u2=u1;
if(f0*fn<0){
u1=n;
} else {
u0=n;
}
// keep u1 as the endpoint with the smaller |f|
if(abs(f0)<abs(f1)){
tmp=u0;
u0=u1;
u1=tmp;
}
steps_taken+=1;
}
if(verbose) print("Steps taken ", steps_taken);
return u1;
}
} // end of block
data {
int<lower=0> N; // size of the data
vector[N] y; // data on variable level
// hyperparameters of the priors:
real h_mu_mu; // prior mean of mu
real h_sigma_mu; // prior sd of mu
real h_lambda_sigma; // rate of the exponential prior on sigma
real h_a_delta; // first shape of the beta prior on delta
real h_b_delta; // second shape of the beta prior on delta
// rootfinder controls, forwarded to iqf_brent01:
real rel_tol; // bracket tolerance; also clips the search to [rel_tol, 1 - rel_tol]
int max_steps; // maximum number of rootfinder iterations
int verbose; // if nonzero, print the steps taken per solve
}
parameters {
real mu; // location for skew-logistic
real<lower=0> sigma; // scale for skew-logistic
real<lower=0,upper=1> delta; //skewness for skew-logistic
}
model {
vector[N] u;
// priors
target += normal_lpdf(mu | h_mu_mu, h_sigma_mu);
target += exponential_lpdf(sigma | h_lambda_sigma);
target += beta_lpdf(delta | h_a_delta, h_b_delta);
for (i in 1:N){
// numerically invert the QF: u[i] is the depth with Q(u[i]) ~= y[i]
u[i] = iqf_brent01(to_vector({mu, sigma, delta}), y[i], rel_tol, max_steps, verbose);
// likelihood contribution via the log density quantile function
target += slogis_s_ldqf_lpdf(u[i] | mu, sigma, delta);
}
}
called by
library(cmdstanr)
library(magrittr)
slogis_as_mod <- cmdstanr::cmdstan_model("stan/slog_pdf_dqf_br_mod.stan")
# Simulate a dataset of size N from the skew-logistic distribution (SLD)
# and assemble the data list expected by stan/slog_pdf_dqf_br_mod.stan.
# Hyperparameters are exposed as arguments; the defaults reproduce the
# previous hard-coded values, so existing calls are unaffected.
# Returns a list with:
#   variables - true parameter values used to simulate (for recovery checks)
#   generated - the Stan data list
slogis_generator_single <- function(N,
                                    h_mu_mu = 0,
                                    h_sigma_mu = 1,
                                    h_lambda_sigma = 1,
                                    h_a_delta = 2,
                                    h_b_delta = 2) {
  # draw the "true" parameters from their priors
  true_mu <- rnorm(1, h_mu_mu, h_sigma_mu)
  true_sigma <- rexp(1, h_lambda_sigma)
  true_delta <- rbeta(1, h_a_delta, h_b_delta)
  # inverse-transform sampling: push uniform depths through the SLD QF
  u <- runif(N)
  y <- true_mu + true_sigma * ((1 - true_delta) * log(u) - true_delta * log1p(-u))
  list(
    variables = list(
      mu = true_mu,
      sigma = true_sigma,
      delta = true_delta
    ),
    generated = list(
      N = N, y = y,
      h_mu_mu = h_mu_mu,
      h_sigma_mu = h_sigma_mu,
      h_lambda_sigma = h_lambda_sigma,
      h_a_delta = h_a_delta,
      h_b_delta = h_b_delta, # FIX: was h_a_delta (copy-paste bug; latent because both defaults are 2)
      rel_tol = 1e-12,
      f_tol = 1e1, # NOTE(review): the br model declares no f_tol in its data block; kept for list compatibility
      max_steps = 1e6,
      verbose = 0
    )
  )
}
# simulate 100 observations and eyeball the draw
stan_data <- slogis_generator_single(100)
hist(stan_data$generated$y)
# random inits matching the priors' support
initfun <- function(){
list(mu=rnorm(1,0,1), sigma=rexp(1,1), delta=rbeta(1, 2, 2))
}
# fit the custom-Brent model
slogis_mod <- slogis_as_mod$sample(data=stan_data$generated,init = initfun,
iter_sampling = 1000, iter_warmup = 1000)
slogis_mod %>% posterior::as_draws_array() %>% bayesplot::mcmc_combo()
|
We have #2720 but it is not quite ready yet. |
Thank you for the reference, Ben. Newton-Raphson, Halley & Schröder are all derivative-based methods, although in this implementation the roots can be constrained by setting the bounds (corrected by the bisection step). What are your concerns regarding a TOMS748 (or Brent) implementation? The need to have the derivative? We could specifically ask the user to provide both the QF and the LDQF. Checking that the derivative of the QF is positive may still be required for QF validation (which I perceive to be a separate step). How does autodiff cope with my custom Brent implementation above? |
For any rootfinder, you would want to implicitly differentiate at the end to get the derivatives with respect to any unknown parameters. You could autodiff through things like Newton iterations but the expression tree can get really big and it is not as accurate as the implicit derivatives. Having a rootfinder that does not rely on derivatives internally to find the solution might be useful, although hopefully it would have a very similar interface to the ones that to do use derivatives. |
The interface I proposed in the original post above is following the existing algebra system API. vector inv_qf(function qf, data vector x, data real rel_tol, data int max_steps,...) Note, that this is not a generic Brent/TOMS748 rootfinder, but rather a special engine for inverting quantile functions. It may be a thin wrapper over a more general rootfinder (if bracketing rootfinder is needed elsewhere), but my function makes a number of assumptions regarding the bounds and tolerances, which is specific to the problem of inverting a Q: [0,1] -> R Just wanted to add here that TOMS748 has |
Description
Problem
When inverting a quantile function $Q: [0,1]\rightarrow\mathbb{R}$, I want to use a bracketing rootfinder to constrain the roots to the $[0,1]$ interval.
Derivative-based vs bracketing rootfinders
The existing derivative-based rootfinders (Newton, Powell) attempt to find the root outside of the $[0,1]$ interval, leading to numerical issues in the quantile function calculation. The bracketing rootfinders (Brent and its iterations, including Zhang, Ridders, Chandrupatla, and TOMS748) require an interval containing the root, i.e. an interval $[a,b]$ such that $\text{sign}(\Omega(a))\neq \text{sign}(\Omega(b))$, where $\Omega$ is the target function.
Target function for inverting QFs
For the task of inverting a quantile function (QF), $\Omega^{-}(u \vert x,\theta)=x-Q(u\vert\theta)$ is a monotonic, non-increasing target function, where $x$ is the observation, $u$ is the depth corresponding to the observation $x$, and $Q(u\vert\theta)$ is a valid (non-decreasing) quantile function1. Finding the root of the target function $\Omega(u \vert x,\theta)$ is tantamount to finding the inverse of the quantile function $Q(u\vert\theta)$.
Quantile-based likelihoods
Given the random sample $\underline y$, we can use the quantile function $Q_Y(u \vert \theta)$ to compute $\underline{Q}=\{Q(u_1\vert\theta), \dots, Q(u_n\vert\theta)\}$, such that $u_i=F(y_i \vert \theta),\ i=1,2,\dots,n$. The depths $u_i$ are degenerate random variables because they are fully determined given the values of $\underline{y}$ and the parameter $\theta$. Since $Q_Y(u_i \vert \theta)=y_i$ we can substitute $\underline Q$ for $\underline y$. Then the Bayesian inference formula becomes

$$f(\theta \vert \underline{y}) \propto f(\underline{Q} \vert \theta)\, f(\theta)$$

where $\underline{Q}=\{Q(u_1\vert\theta), \dots, Q(u_n\vert\theta)\}$, such that $u_i=F(y_i \vert \theta),\ i=1,2,\dots,n$. In case the CDF $F(\underline y \vert \theta)$ is not available, a numeric approximation $\widehat{Q^{-1}}(\underline{y} \vert \theta)$ has to be used (Perepolkin, Goodrich & Sahlin, 2021). This is where numerical inverting of quantile functions becomes useful.
Quantile functions are easily extensible using the Gilchrist's transformation rules (Gilchrist, 2000). Modification of quantile functions can help create highly flexible quantile-based likelihoods, (ref Award 2153019), which may not be invertible.
Example
While logistic quantile function$Q(u)=\text{logit(u)}=\ln(u)-\ln(1-u)$ is easily invertible, introducing the weights to the exponential components of the logistic quantile function $Q(u\vert \delta)=(1-\delta)\ln(u)-\delta\ln(1-u)$ creates the skew-logistic (SLD) quantile function, which, unfortunately, is no longer invertible.
Let's assume the data Y is inversely distributed as the Skew-Logistic Distribution2 with location$\mu$ , scale $\sigma$ and skewness $\delta$ , assigned some diffuse priors.
In Stan it would be implemented as:
The proposal is a generic QF-inverting function with a scalar/vector signature. It takes a reference to a quantile function `qf(u, ...)` with the parameters passed through the ellipsis `...`. The tolerance `rel_tol` and the maximum number of steps `max_steps` regulate the convergence.

Expected Output

The output of `inv_qf()` is the vector of (real) values of depths `u`, which are the roots of the target function $\Omega^{-}(u \vert x,\theta)=x-Q(u\vert\theta)$ for the observations `x`, given the values of the parameters $\theta$ (passed to `qf()` through the ellipsis).

Current Version:
v4.5.0
References
Gilchrist W. 2000. Statistical modelling with quantile functions. Boca Raton: Chapman & Hall/CRC.
Footnotes
An alternative (non-decreasing) formulation of the target function $\Omega^{+}(u \vert x,\theta)=Q(u\vert\theta)-x$ is equally plausible. ↩
We say that the data Y is "inversely distributed as SLD", because SLD lacks the CDF, therefore, the data can not be "distributed as". The depth, corresponding to observations of Y is distributed as SLD (via the quantile function). Therefore Y is said to be "inversely distributed as" (Perepolkin, Goodrich & Sahlin, 2021). ↩
The text was updated successfully, but these errors were encountered: