001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.math3.stat.regression; 019 import java.io.Serializable; 020 021 import org.apache.commons.math3.exception.OutOfRangeException; 022 import org.apache.commons.math3.distribution.TDistribution; 023 import org.apache.commons.math3.exception.MathIllegalArgumentException; 024 import org.apache.commons.math3.exception.NoDataException; 025 import org.apache.commons.math3.exception.util.LocalizedFormats; 026 import org.apache.commons.math3.util.FastMath; 027 import org.apache.commons.math3.util.Precision; 028 029 /** 030 * Estimates an ordinary least squares regression model 031 * with one independent variable. 032 * <p> 033 * <code> y = intercept + slope * x </code></p> 034 * <p> 035 * Standard errors for <code>intercept</code> and <code>slope</code> are 036 * available as well as ANOVA, r-square and Pearson's r statistics.</p> 037 * <p> 038 * Observations (x,y pairs) can be added to the model one at a time or they 039 * can be provided in a 2-dimensional array. The observations are not stored 040 * in memory, so there is no limit to the number of observations that can be 041 * added to the model.</p> 042 * <p> 043 * <strong>Usage Notes</strong>: <ul> 044 * <li> When there are fewer than two observations in the model, or when 045 * there is no variation in the x values (i.e. all x values are the same) 046 * all statistics return <code>NaN</code>. At least two observations with 047 * different x coordinates are required to estimate a bivariate regression 048 * model. 049 * </li> 050 * <li> Getters for the statistics always compute values based on the current 051 * set of observations -- i.e., you can get statistics, then add more data 052 * and get updated statistics without using a new instance. There is no 053 * "compute" method that updates all statistics. Each of the getters performs 054 * the necessary computations to return the requested statistic. 055 * </li> 056 * <li> The intercept term may be suppressed by passing {@code false} to 057 * the {@link #SimpleRegression(boolean)} constructor. When the 058 * {@code hasIntercept} property is false, the model is estimated without a 059 * constant term and {@link #getIntercept()} returns {@code 0}.</li> 060 * </ul></p> 061 * 062 * @version $Id: SimpleRegression.java 1416643 2012-12-03 19:37:14Z tn $ 063 */ 064 public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression { 065 066 /** Serializable version identifier */ 067 private static final long serialVersionUID = -3004689053607543335L; 068 069 /** sum of x values */ 070 private double sumX = 0d; 071 072 /** total variation in x (sum of squared deviations from xbar) */ 073 private double sumXX = 0d; 074 075 /** sum of y values */ 076 private double sumY = 0d; 077 078 /** total variation in y (sum of squared deviations from ybar) */ 079 private double sumYY = 0d; 080 081 /** sum of products */ 082 private double sumXY = 0d; 083 084 /** number of observations */ 085 private long n = 0; 086 087 /** mean of accumulated x values, used in updating formulas */ 088 private double xbar = 0; 089 090 /** mean of accumulated y values, used in updating formulas */ 091 private double ybar = 0; 092 093 /** include an intercept or not */ 094 private final boolean hasIntercept; 095 // ---------------------Public methods-------------------------------------- 096 097 /** 098 * Create an empty SimpleRegression instance 099 */ 100 public SimpleRegression() { 101 this(true); 102 } 103 /** 104 * Create a SimpleRegression instance, specifying whether or not to estimate 105 * an intercept. 106 * 107 * <p>Use {@code false} to estimate a model with no intercept. When the 108 * {@code hasIntercept} property is false, the model is estimated without a 109 * constant term and {@link #getIntercept()} returns {@code 0}.</p> 110 * 111 * @param includeIntercept whether or not to include an intercept term in 112 * the regression model 113 */ 114 public SimpleRegression(boolean includeIntercept) { 115 super(); 116 hasIntercept = includeIntercept; 117 } 118 119 /** 120 * Adds the observation (x,y) to the regression data set. 121 * <p> 122 * Uses updating formulas for means and sums of squares defined in 123 * "Algorithms for Computing the Sample Variance: Analysis and 124 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 125 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in 126 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p> 127 * 128 * 129 * @param x independent variable value 130 * @param y dependent variable value 131 */ 132 public void addData(final double x,final double y) { 133 if (n == 0) { 134 xbar = x; 135 ybar = y; 136 } else { 137 if( hasIntercept ){ 138 final double fact1 = 1.0 + n; 139 final double fact2 = n / (1.0 + n); 140 final double dx = x - xbar; 141 final double dy = y - ybar; 142 sumXX += dx * dx * fact2; 143 sumYY += dy * dy * fact2; 144 sumXY += dx * dy * fact2; 145 xbar += dx / fact1; 146 ybar += dy / fact1; 147 } 148 } 149 if( !hasIntercept ){ 150 sumXX += x * x ; 151 sumYY += y * y ; 152 sumXY += x * y ; 153 } 154 sumX += x; 155 sumY += y; 156 n++; 157 } 158 159 160 /** 161 * Removes the observation (x,y) from the regression data set. 162 * <p> 163 * Mirrors the addData method. This method permits the use of 164 * SimpleRegression instances in streaming mode where the regression 165 * is applied to a sliding "window" of observations, however the caller is 166 * responsible for maintaining the set of observations in the window.</p> 167 * 168 * The method has no effect if there are no points of data (i.e. n=0) 169 * 170 * @param x independent variable value 171 * @param y dependent variable value 172 */ 173 public void removeData(final double x,final double y) { 174 if (n > 0) { 175 if (hasIntercept) { 176 final double fact1 = n - 1.0; 177 final double fact2 = n / (n - 1.0); 178 final double dx = x - xbar; 179 final double dy = y - ybar; 180 sumXX -= dx * dx * fact2; 181 sumYY -= dy * dy * fact2; 182 sumXY -= dx * dy * fact2; 183 xbar -= dx / fact1; 184 ybar -= dy / fact1; 185 } else { 186 final double fact1 = n - 1.0; 187 sumXX -= x * x; 188 sumYY -= y * y; 189 sumXY -= x * y; 190 xbar -= x / fact1; 191 ybar -= y / fact1; 192 } 193 sumX -= x; 194 sumY -= y; 195 n--; 196 } 197 } 198 199 /** 200 * Adds the observations represented by the elements in 201 * <code>data</code>. 202 * <p> 203 * <code>(data[0][0],data[0][1])</code> will be the first observation, then 204 * <code>(data[1][0],data[1][1])</code>, etc.</p> 205 * <p> 206 * This method does not replace data that has already been added. The 207 * observations represented by <code>data</code> are added to the existing 208 * dataset.</p> 209 * <p> 210 * To replace all data, use <code>clear()</code> before adding the new 211 * data.</p> 212 * 213 * @param data array of observations to be added 214 * @throws ModelSpecificationException if the length of {@code data[i]} is not 215 * greater than or equal to 2 216 */ 217 public void addData(final double[][] data) throws ModelSpecificationException { 218 for (int i = 0; i < data.length; i++) { 219 if( data[i].length < 2 ){ 220 throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION, 221 data[i].length, 2); 222 } 223 addData(data[i][0], data[i][1]); 224 } 225 } 226 227 /** 228 * Adds one observation to the regression model. 229 * 230 * @param x the independent variables which form the design matrix 231 * @param y the dependent or response variable 232 * @throws ModelSpecificationException if the length of {@code x} does not equal 233 * the number of independent variables in the model 234 */ 235 public void addObservation(final double[] x,final double y) 236 throws ModelSpecificationException { 237 if( x == null || x.length == 0 ){ 238 throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1); 239 } 240 addData( x[0], y ); 241 } 242 243 /** 244 * Adds a series of observations to the regression model. The lengths of 245 * x and y must be the same and x must be rectangular. 246 * 247 * @param x a series of observations on the independent variables 248 * @param y a series of observations on the dependent variable 249 * The length of x and y must be the same 250 * @throws ModelSpecificationException if {@code x} is not rectangular, does not match 251 * the length of {@code y} or does not contain sufficient data to estimate the model 252 */ 253 public void addObservations(final double[][] x,final double[] y) throws ModelSpecificationException { 254 if ((x == null) || (y == null) || (x.length != y.length)) { 255 throw new ModelSpecificationException( 256 LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, 257 (x == null) ? 0 : x.length, 258 (y == null) ? 0 : y.length); 259 } 260 boolean obsOk=true; 261 for( int i = 0 ; i < x.length; i++){ 262 if( x[i] == null || x[i].length == 0 ){ 263 obsOk = false; 264 } 265 } 266 if( !obsOk ){ 267 throw new ModelSpecificationException( 268 LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS, 269 0, 1); 270 } 271 for( int i = 0 ; i < x.length ; i++){ 272 addData( x[i][0], y[i] ); 273 } 274 } 275 276 /** 277 * Removes observations represented by the elements in <code>data</code>. 278 * <p> 279 * If the array is larger than the current n, only the first n elements are 280 * processed. This method permits the use of SimpleRegression instances in 281 * streaming mode where the regression is applied to a sliding "window" of 282 * observations, however the caller is responsible for maintaining the set 283 * of observations in the window.</p> 284 * <p> 285 * To remove all data, use <code>clear()</code>.</p> 286 * 287 * @param data array of observations to be removed 288 */ 289 public void removeData(double[][] data) { 290 for (int i = 0; i < data.length && n > 0; i++) { 291 removeData(data[i][0], data[i][1]); 292 } 293 } 294 295 /** 296 * Clears all data from the model. 297 */ 298 public void clear() { 299 sumX = 0d; 300 sumXX = 0d; 301 sumY = 0d; 302 sumYY = 0d; 303 sumXY = 0d; 304 n = 0; 305 } 306 307 /** 308 * Returns the number of observations that have been added to the model. 309 * 310 * @return n number of observations that have been added. 311 */ 312 public long getN() { 313 return n; 314 } 315 316 /** 317 * Returns the "predicted" <code>y</code> value associated with the 318 * supplied <code>x</code> value, based on the data that has been 319 * added to the model when this method is activated. 320 * <p> 321 * <code> predict(x) = intercept + slope * x </code></p> 322 * <p> 323 * <strong>Preconditions</strong>: <ul> 324 * <li>At least two observations (with at least two different x values) 325 * must have been added before invoking this method. If this method is 326 * invoked before a model can be estimated, <code>Double,NaN</code> is 327 * returned. 328 * </li></ul></p> 329 * 330 * @param x input <code>x</code> value 331 * @return predicted <code>y</code> value 332 */ 333 public double predict(final double x) { 334 final double b1 = getSlope(); 335 if (hasIntercept) { 336 return getIntercept(b1) + b1 * x; 337 } 338 return b1 * x; 339 } 340 341 /** 342 * Returns the intercept of the estimated regression line, if 343 * {@link #hasIntercept()} is true; otherwise 0. 344 * <p> 345 * The least squares estimate of the intercept is computed using the 346 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 347 * The intercept is sometimes denoted b0.</p> 348 * <p> 349 * <strong>Preconditions</strong>: <ul> 350 * <li>At least two observations (with at least two different x values) 351 * must have been added before invoking this method. If this method is 352 * invoked before a model can be estimated, <code>Double,NaN</code> is 353 * returned. 354 * </li></ul></p> 355 * 356 * @return the intercept of the regression line if the model includes an 357 * intercept; 0 otherwise 358 * @see #SimpleRegression(boolean) 359 */ 360 public double getIntercept() { 361 return hasIntercept ? getIntercept(getSlope()) : 0.0; 362 } 363 364 /** 365 * Returns true if the model includes an intercept term. 366 * 367 * @return true if the regression includes an intercept; false otherwise 368 * @see #SimpleRegression(boolean) 369 */ 370 public boolean hasIntercept() { 371 return hasIntercept; 372 } 373 374 /** 375 * Returns the slope of the estimated regression line. 376 * <p> 377 * The least squares estimate of the slope is computed using the 378 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 379 * The slope is sometimes denoted b1.</p> 380 * <p> 381 * <strong>Preconditions</strong>: <ul> 382 * <li>At least two observations (with at least two different x values) 383 * must have been added before invoking this method. If this method is 384 * invoked before a model can be estimated, <code>Double.NaN</code> is 385 * returned. 386 * </li></ul></p> 387 * 388 * @return the slope of the regression line 389 */ 390 public double getSlope() { 391 if (n < 2) { 392 return Double.NaN; //not enough data 393 } 394 if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) { 395 return Double.NaN; //not enough variation in x 396 } 397 return sumXY / sumXX; 398 } 399 400 /** 401 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> 402 * sum of squared errors</a> (SSE) associated with the regression 403 * model. 404 * <p> 405 * The sum is computed using the computational formula</p> 406 * <p> 407 * <code>SSE = SYY - (SXY * SXY / SXX)</code></p> 408 * <p> 409 * where <code>SYY</code> is the sum of the squared deviations of the y 410 * values about their mean, <code>SXX</code> is similarly defined and 411 * <code>SXY</code> is the sum of the products of x and y mean deviations. 412 * </p><p> 413 * The sums are accumulated using the updating algorithm referenced in 414 * {@link #addData}.</p> 415 * <p> 416 * The return value is constrained to be non-negative - i.e., if due to 417 * rounding errors the computational formula returns a negative result, 418 * 0 is returned.</p> 419 * <p> 420 * <strong>Preconditions</strong>: <ul> 421 * <li>At least two observations (with at least two different x values) 422 * must have been added before invoking this method. If this method is 423 * invoked before a model can be estimated, <code>Double,NaN</code> is 424 * returned. 425 * </li></ul></p> 426 * 427 * @return sum of squared errors associated with the regression model 428 */ 429 public double getSumSquaredErrors() { 430 return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX); 431 } 432 433 /** 434 * Returns the sum of squared deviations of the y values about their mean. 435 * <p> 436 * This is defined as SSTO 437 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p> 438 * <p> 439 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p> 440 * 441 * @return sum of squared deviations of y values 442 */ 443 public double getTotalSumSquares() { 444 if (n < 2) { 445 return Double.NaN; 446 } 447 return sumYY; 448 } 449 450 /** 451 * Returns the sum of squared deviations of the x values about their mean. 452 * 453 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p> 454 * 455 * @return sum of squared deviations of x values 456 */ 457 public double getXSumSquares() { 458 if (n < 2) { 459 return Double.NaN; 460 } 461 return sumXX; 462 } 463 464 /** 465 * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>. 466 * 467 * @return sum of cross products 468 */ 469 public double getSumOfCrossProducts() { 470 return sumXY; 471 } 472 473 /** 474 * Returns the sum of squared deviations of the predicted y values about 475 * their mean (which equals the mean of y). 476 * <p> 477 * This is usually abbreviated SSR or SSM. It is defined as SSM 478 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p> 479 * <p> 480 * <strong>Preconditions</strong>: <ul> 481 * <li>At least two observations (with at least two different x values) 482 * must have been added before invoking this method. If this method is 483 * invoked before a model can be estimated, <code>Double.NaN</code> is 484 * returned. 485 * </li></ul></p> 486 * 487 * @return sum of squared deviations of predicted y values 488 */ 489 public double getRegressionSumSquares() { 490 return getRegressionSumSquares(getSlope()); 491 } 492 493 /** 494 * Returns the sum of squared errors divided by the degrees of freedom, 495 * usually abbreviated MSE. 496 * <p> 497 * If there are fewer than <strong>three</strong> data pairs in the model, 498 * or if there is no variation in <code>x</code>, this returns 499 * <code>Double.NaN</code>.</p> 500 * 501 * @return sum of squared deviations of y values 502 */ 503 public double getMeanSquareError() { 504 if (n < 3) { 505 return Double.NaN; 506 } 507 return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1)); 508 } 509 510 /** 511 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html"> 512 * Pearson's product moment correlation coefficient</a>, 513 * usually denoted r. 514 * <p> 515 * <strong>Preconditions</strong>: <ul> 516 * <li>At least two observations (with at least two different x values) 517 * must have been added before invoking this method. If this method is 518 * invoked before a model can be estimated, <code>Double,NaN</code> is 519 * returned. 520 * </li></ul></p> 521 * 522 * @return Pearson's r 523 */ 524 public double getR() { 525 double b1 = getSlope(); 526 double result = FastMath.sqrt(getRSquare()); 527 if (b1 < 0) { 528 result = -result; 529 } 530 return result; 531 } 532 533 /** 534 * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 535 * coefficient of determination</a>, 536 * usually denoted r-square. 537 * <p> 538 * <strong>Preconditions</strong>: <ul> 539 * <li>At least two observations (with at least two different x values) 540 * must have been added before invoking this method. If this method is 541 * invoked before a model can be estimated, <code>Double,NaN</code> is 542 * returned. 543 * </li></ul></p> 544 * 545 * @return r-square 546 */ 547 public double getRSquare() { 548 double ssto = getTotalSumSquares(); 549 return (ssto - getSumSquaredErrors()) / ssto; 550 } 551 552 /** 553 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> 554 * standard error of the intercept estimate</a>, 555 * usually denoted s(b0). 556 * <p> 557 * If there are fewer that <strong>three</strong> observations in the 558 * model, or if there is no variation in x, this returns 559 * <code>Double.NaN</code>.</p> Additionally, a <code>Double.NaN</code> is 560 * returned when the intercept is constrained to be zero 561 * 562 * @return standard error associated with intercept estimate 563 */ 564 public double getInterceptStdErr() { 565 if( !hasIntercept ){ 566 return Double.NaN; 567 } 568 return FastMath.sqrt( 569 getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX)); 570 } 571 572 /** 573 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard 574 * error of the slope estimate</a>, 575 * usually denoted s(b1). 576 * <p> 577 * If there are fewer that <strong>three</strong> data pairs in the model, 578 * or if there is no variation in x, this returns <code>Double.NaN</code>. 579 * </p> 580 * 581 * @return standard error associated with slope estimate 582 */ 583 public double getSlopeStdErr() { 584 return FastMath.sqrt(getMeanSquareError() / sumXX); 585 } 586 587 /** 588 * Returns the half-width of a 95% confidence interval for the slope 589 * estimate. 590 * <p> 591 * The 95% confidence interval is</p> 592 * <p> 593 * <code>(getSlope() - getSlopeConfidenceInterval(), 594 * getSlope() + getSlopeConfidenceInterval())</code></p> 595 * <p> 596 * If there are fewer that <strong>three</strong> observations in the 597 * model, or if there is no variation in x, this returns 598 * <code>Double.NaN</code>.</p> 599 * <p> 600 * <strong>Usage Note</strong>:<br> 601 * The validity of this statistic depends on the assumption that the 602 * observations included in the model are drawn from a 603 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 604 * Bivariate Normal Distribution</a>.</p> 605 * 606 * @return half-width of 95% confidence interval for the slope estimate 607 * @throws OutOfRangeException if the confidence interval can not be computed. 608 */ 609 public double getSlopeConfidenceInterval() throws OutOfRangeException { 610 return getSlopeConfidenceInterval(0.05d); 611 } 612 613 /** 614 * Returns the half-width of a (100-100*alpha)% confidence interval for 615 * the slope estimate. 616 * <p> 617 * The (100-100*alpha)% confidence interval is </p> 618 * <p> 619 * <code>(getSlope() - getSlopeConfidenceInterval(), 620 * getSlope() + getSlopeConfidenceInterval())</code></p> 621 * <p> 622 * To request, for example, a 99% confidence interval, use 623 * <code>alpha = .01</code></p> 624 * <p> 625 * <strong>Usage Note</strong>:<br> 626 * The validity of this statistic depends on the assumption that the 627 * observations included in the model are drawn from a 628 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 629 * Bivariate Normal Distribution</a>.</p> 630 * <p> 631 * <strong> Preconditions:</strong><ul> 632 * <li>If there are fewer that <strong>three</strong> observations in the 633 * model, or if there is no variation in x, this returns 634 * <code>Double.NaN</code>. 635 * </li> 636 * <li><code>(0 < alpha < 1)</code>; otherwise an 637 * <code>OutOfRangeException</code> is thrown. 638 * </li></ul></p> 639 * 640 * @param alpha the desired significance level 641 * @return half-width of 95% confidence interval for the slope estimate 642 * @throws OutOfRangeException if the confidence interval can not be computed. 643 */ 644 public double getSlopeConfidenceInterval(final double alpha) 645 throws OutOfRangeException { 646 if (n < 3) { 647 return Double.NaN; 648 } 649 if (alpha >= 1 || alpha <= 0) { 650 throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL, 651 alpha, 0, 1); 652 } 653 // No advertised NotStrictlyPositiveException here - will return NaN above 654 TDistribution distribution = new TDistribution(n - 2); 655 return getSlopeStdErr() * 656 distribution.inverseCumulativeProbability(1d - alpha / 2d); 657 } 658 659 /** 660 * Returns the significance level of the slope (equiv) correlation. 661 * <p> 662 * Specifically, the returned value is the smallest <code>alpha</code> 663 * such that the slope confidence interval with significance level 664 * equal to <code>alpha</code> does not include <code>0</code>. 665 * On regression output, this is often denoted <code>Prob(|t| > 0)</code> 666 * </p><p> 667 * <strong>Usage Note</strong>:<br> 668 * The validity of this statistic depends on the assumption that the 669 * observations included in the model are drawn from a 670 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 671 * Bivariate Normal Distribution</a>.</p> 672 * <p> 673 * If there are fewer that <strong>three</strong> observations in the 674 * model, or if there is no variation in x, this returns 675 * <code>Double.NaN</code>.</p> 676 * 677 * @return significance level for slope/correlation 678 * @throws org.apache.commons.math3.exception.MaxCountExceededException 679 * if the significance level can not be computed. 680 */ 681 public double getSignificance() { 682 if (n < 3) { 683 return Double.NaN; 684 } 685 // No advertised NotStrictlyPositiveException here - will return NaN above 686 TDistribution distribution = new TDistribution(n - 2); 687 return 2d * (1.0 - distribution.cumulativeProbability( 688 FastMath.abs(getSlope()) / getSlopeStdErr())); 689 } 690 691 // ---------------------Private methods----------------------------------- 692 693 /** 694 * Returns the intercept of the estimated regression line, given the slope. 695 * <p> 696 * Will return <code>NaN</code> if slope is <code>NaN</code>.</p> 697 * 698 * @param slope current slope 699 * @return the intercept of the regression line 700 */ 701 private double getIntercept(final double slope) { 702 if( hasIntercept){ 703 return (sumY - slope * sumX) / n; 704 } 705 return 0.0; 706 } 707 708 /** 709 * Computes SSR from b1. 710 * 711 * @param slope regression slope estimate 712 * @return sum of squared deviations of predicted y values 713 */ 714 private double getRegressionSumSquares(final double slope) { 715 return slope * slope * sumXX; 716 } 717 718 /** 719 * Performs a regression on data present in buffers and outputs a RegressionResults object. 720 * 721 * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true 722 * a {@code NoDataException} is thrown. If there is no intercept term, the model must 723 * contain at least 2 observations.</p> 724 * 725 * @return RegressionResults acts as a container of regression output 726 * @throws ModelSpecificationException if the model is not correctly specified 727 * @throws NoDataException if there is not sufficient data in the model to 728 * estimate the regression parameters 729 */ 730 public RegressionResults regress() throws ModelSpecificationException, NoDataException { 731 if (hasIntercept) { 732 if( n < 3 ){ 733 throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION); 734 } 735 if( FastMath.abs( sumXX ) > Precision.SAFE_MIN ){ 736 final double[] params = new double[]{ getIntercept(), getSlope() }; 737 final double mse = getMeanSquareError(); 738 final double _syy = sumYY + sumY * sumY / n; 739 final double[] vcv = new double[]{ 740 mse * (xbar *xbar /sumXX + 1.0 / n), 741 -xbar*mse/sumXX, 742 mse/sumXX }; 743 return new RegressionResults( 744 params, new double[][]{vcv}, true, n, 2, 745 sumY, _syy, getSumSquaredErrors(),true,false); 746 }else{ 747 final double[] params = new double[]{ sumY / n, Double.NaN }; 748 //final double mse = getMeanSquareError(); 749 final double[] vcv = new double[]{ 750 ybar / (n - 1.0), 751 Double.NaN, 752 Double.NaN }; 753 return new RegressionResults( 754 params, new double[][]{vcv}, true, n, 1, 755 sumY, sumYY, getSumSquaredErrors(),true,false); 756 } 757 }else{ 758 if (n < 2) { 759 throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION); 760 } 761 if( !Double.isNaN(sumXX) ){ 762 final double[] vcv = new double[]{ getMeanSquareError() / sumXX }; 763 final double[] params = new double[]{ sumXY/sumXX }; 764 return new RegressionResults( 765 params, new double[][]{vcv}, true, n, 1, 766 sumY, sumYY, getSumSquaredErrors(),false,false); 767 }else{ 768 final double[] vcv = new double[]{Double.NaN }; 769 final double[] params = new double[]{ Double.NaN }; 770 return new RegressionResults( 771 params, new double[][]{vcv}, true, n, 1, 772 Double.NaN, Double.NaN, Double.NaN,false,false); 773 } 774 } 775 } 776 777 /** 778 * Performs a regression on data present in buffers including only regressors 779 * indexed in variablesToInclude and outputs a RegressionResults object 780 * @param variablesToInclude an array of indices of regressors to include 781 * @return RegressionResults acts as a container of regression output 782 * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length 783 * @throws OutOfRangeException if a requested variable is not present in model 784 */ 785 public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException{ 786 if( variablesToInclude == null || variablesToInclude.length == 0){ 787 throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED); 788 } 789 if( variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept) ){ 790 throw new ModelSpecificationException( 791 LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES, 792 (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2); 793 } 794 795 if( hasIntercept ){ 796 if( variablesToInclude.length == 2 ){ 797 if( variablesToInclude[0] == 1 ){ 798 throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE); 799 }else if( variablesToInclude[0] != 0 ){ 800 throw new OutOfRangeException( variablesToInclude[0], 0,1 ); 801 } 802 if( variablesToInclude[1] != 1){ 803 throw new OutOfRangeException( variablesToInclude[0], 0,1 ); 804 } 805 return regress(); 806 }else{ 807 if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ){ 808 throw new OutOfRangeException( variablesToInclude[0],0,1 ); 809 } 810 final double _mean = sumY * sumY / n; 811 final double _syy = sumYY + _mean; 812 if( variablesToInclude[0] == 0 ){ 813 //just the mean 814 final double[] vcv = new double[]{ sumYY/(((n-1)*n)) }; 815 final double[] params = new double[]{ ybar }; 816 return new RegressionResults( 817 params, new double[][]{vcv}, true, n, 1, 818 sumY, _syy+_mean, sumYY,true,false); 819 820 }else if( variablesToInclude[0] == 1){ 821 //final double _syy = sumYY + sumY * sumY / ((double) n); 822 final double _sxx = sumXX + sumX * sumX / n; 823 final double _sxy = sumXY + sumX * sumY / n; 824 final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx); 825 final double _mse = _sse/((n-1)); 826 if( !Double.isNaN(_sxx) ){ 827 final double[] vcv = new double[]{ _mse / _sxx }; 828 final double[] params = new double[]{ _sxy/_sxx }; 829 return new RegressionResults( 830 params, new double[][]{vcv}, true, n, 1, 831 sumY, _syy, _sse,false,false); 832 }else{ 833 final double[] vcv = new double[]{Double.NaN }; 834 final double[] params = new double[]{ Double.NaN }; 835 return new RegressionResults( 836 params, new double[][]{vcv}, true, n, 1, 837 Double.NaN, Double.NaN, Double.NaN,false,false); 838 } 839 } 840 } 841 }else{ 842 if( variablesToInclude[0] != 0 ){ 843 throw new OutOfRangeException(variablesToInclude[0],0,0); 844 } 845 return regress(); 846 } 847 848 return null; 849 } 850 }