Java API By Example, From Geeks To Geeks.

# Java > Open Source Codes > org > apache > commons > math > stat > regression > SimpleRegression

28  * y = intercept + slope * x 29  *

30  * Standard errors for intercept and slope are 31  * available as well as ANOVA, r-square and Pearson's r statistics.32  *

33  * Observations (x,y pairs) can be added to the model one at a time or they 34  * can be provided in a 2-dimensional array. The observations are not stored35  * in memory, so there is no limit to the number of observations that can be36  * added to the model. 37  *

38  * Usage Notes:

39  *
• When there are fewer than two observations in the model, or when40  * there is no variation in the x values (i.e. all x values are the same) 41  * all statistics return NaN. At least two observations with42  * different x coordinates are required to estimate a bivariate regression 43  * model.44  *
• 45  *
• getters for the statistics always compute values based on the current46  * set of observations -- i.e., you can get statistics, then add more data47  * and get updated statistics without using a new instance. There is no 48  * "compute" method that updates all statistics. Each of the getters performs49  * the necessary computations to return the requested statistic.
• 50  *
51  *52  * @version \$Revision\$ \$Date: 2005-02-26 05:11:52 -0800 (Sat, 26 Feb 2005) \$53  */54 public class SimpleRegression implements Serializable {55 56     /** Serializable version identifier */57     static final long serialVersionUID = -3004689053607543335L;58 59     /** sum of x values */60     private double sumX = 0d;61 62     /** total variation in x (sum of squared deviations from xbar) */63     private double sumXX = 0d;64 65     /** sum of y values */66     private double sumY = 0d;67 68     /** total variation in y (sum of squared deviations from ybar) */69     private double sumYY = 0d;70 71     /** sum of products */72     private double sumXY = 0d;73 74     /** number of observations */75     private long n = 0;76 77     /** mean of accumulated x values, used in updating formulas */78     private double xbar = 0;79 80     /** mean of accumulated y values, used in updating formulas */81     private double ybar = 0;82 83     // ---------------------Public methods--------------------------------------84 85     /**86      * Create an empty SimpleRegression instance87      */88     public SimpleRegression() {89         super();90     }91     92     /**93      * Adds the observation (x,y) to the regression data set.94      *

95      * Uses updating formulas for means and sums of squares defined in 96      * "Algorithms for Computing the Sample Variance: Analysis and97      * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 98      * 1983, American Statistician, vol. 37, pp. 242-247, referenced in99      * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985100      *101      *102      * @param x independent variable value103      * @param y dependent variable value104      */105     public void addData(double x, double y) {106         if (n == 0) {107             xbar = x;108             ybar = y;109         } else {110             double dx = x - xbar;111             double dy = y - ybar;112             sumXX += dx * dx * (double) n / (double) (n + 1.0);113             sumYY += dy * dy * (double) n / (double) (n + 1.0);114             sumXY += dx * dy * (double) n / (double) (n + 1.0);115             xbar += dx / (double) (n + 1.0);116             ybar += dy / (double) (n + 1.0);117         }118         sumX += x;119         sumY += y;120         n++;121     }122 123     /**124      * Adds the observations represented by the elements in 125      * data.126      *

127      * (data[0][0],data[0][1]) will be the first observation, then128      * (data[1][0],data[1][1]), etc. 129      *

130      * This method does not replace data that has already been added. The131      * observations represented by data are added to the existing132      * dataset.133      *

134      * To replace all data, use clear() before adding the new 135      * data.136      * 137      * @param data array of observations to be added138      */139     public void addData(double[][] data) {140         for (int i = 0; i < data.length; i++) {141             addData(data[i][0], data[i][1]);142         }143     }144 145     /**146      * Clears all data from the model.147      */148     public void clear() {149         sumX = 0d;150         sumXX = 0d;151         sumY = 0d;152         sumYY = 0d;153         sumXY = 0d;154         n = 0;155     }156 157     /**158      * Returns the number of observations that have been added to the model.159      *160      * @return n number of observations that have been added.161      */162     public long getN() {163         return n;164     }165 166     /**167      * Returns the "predicted" y value associated with the 168      * supplied x value, based on the data that has been169      * added to the model when this method is activated.170      *

171      * predict(x) = intercept + slope * x 172      *

173      * Preconditions:

174      *
• At least two observations (with at least two different x values)175      * must have been added before invoking this method. If this method is 176      * invoked before a model can be estimated, Double,NaN is177      * returned.178      *
179      *180      * @param x input x value181      * @return predicted y value182      */183     public double predict(double x) {184         double b1 = getSlope();185         return getIntercept(b1) + b1 * x;186     }187 188     /**189      * Returns the intercept of the estimated regression line.190      *

191      * The least squares estimate of the intercept is computed using the 192      * normal equations.193      * The intercept is sometimes denoted b0. 194      *

195      * Preconditions:

196      *
• At least two observations (with at least two different x values)197      * must have been added before invoking this method. If this method is 198      * invoked before a model can be estimated, Double,NaN is199      * returned.200      *
201      *202      * @return the intercept of the regression line203      */204     public double getIntercept() {205         return getIntercept(getSlope());206     }207 208     /**209     * Returns the slope of the estimated regression line. 210     *

211     * The least squares estimate of the slope is computed using the 212     * normal equations.213     * The slope is sometimes denoted b1. 214     *

215     * Preconditions:

216     *
• At least two observations (with at least two different x values)217     * must have been added before invoking this method. If this method is 218     * invoked before a model can be estimated, Double.NaN is219     * returned.220     *
221     *222     * @return the slope of the regression line223     */224     public double getSlope() {225         if (n < 2) {226             return Double.NaN; //not enough data 227 }228         if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {229             return Double.NaN; //not enough variation in x230 }231         return sumXY / sumXX;232     }233 234     /**235      * Returns the 236      * sum of squared errors (SSE) associated with the regression 237      * model.238      *

239      * Preconditions:

240      *
• At least two observations (with at least two different x values)241      * must have been added before invoking this method. If this method is 242      * invoked before a model can be estimated, Double,NaN is243      * returned.244      *
245      *246      * @return sum of squared errors associated with the regression model247      */248     public double getSumSquaredErrors() {249         return getSumSquaredErrors(getSlope());250     }251 252     /**253      * Returns the sum of squared deviations of the y values about their mean.254      *

255      * This is defined as SSTO 256      * here.257      *

258      * If n < 2, this returns Double.NaN.259      *260      * @return sum of squared deviations of y values261      */262     public double getTotalSumSquares() {263         if (n < 2) {264             return Double.NaN;265         }266         return sumYY;267     }268 269     /**270      * Returns the sum of squared deviations of the predicted y values about 271      * their mean (which equals the mean of y).272      *

273      * This is usually abbreviated SSR or SSM. It is defined as SSM 274      * here275      *

276      * Preconditions:

277      *
• At least two observations (with at least two different x values)278      * must have been added before invoking this method. If this method is 279      * invoked before a model can be estimated, Double.NaN is280      * returned.281      *
282      *283      * @return sum of squared deviations of predicted y values284      */285     public double getRegressionSumSquares() {286         return getRegressionSumSquares(getSlope());287     }288 289     /**290      * Returns the sum of squared errors divided by the degrees of freedom,291      * usually abbreviated MSE. 292      *

293      * If there are fewer than three data pairs in the model,294      * or if there is no variation in x, this returns 295      * Double.NaN.296      *297      * @return sum of squared deviations of y values298      */299     public double getMeanSquareError() {300         if (n < 3) {301             return Double.NaN;302         }303         return getSumSquaredErrors() / (double) (n - 2);304     }305 306     /**307      * Returns 308      * Pearson's product moment correlation coefficient,309      * usually denoted r. 310      *

311      * Preconditions:

312      *
• At least two observations (with at least two different x values)313      * must have been added before invoking this method. If this method is 314      * invoked before a model can be estimated, Double,NaN is315      * returned.316      *
317      *318      * @return Pearson's r319      */320     public double getR() {321         double b1 = getSlope();322         double result = Math.sqrt(getRSquare(b1));323         if (b1 < 0) {324             result = -result;325         }326         return result;327     }328 329     /** 330      * Returns the 331      * coefficient of determination,332      * usually denoted r-square. 333      *

334      * Preconditions:

335      *
• At least two observations (with at least two different x values)336      * must have been added before invoking this method. If this method is 337      * invoked before a model can be estimated, Double,NaN is338      * returned.339      *
340      *341      * @return r-square342      */343     public double getRSquare() {344         return getRSquare(getSlope());345     }346 347     /**348      * Returns the 349      * standard error of the intercept estimate, 350      * usually denoted s(b0). 351      *

352      * If there are fewer that three observations in the 353      * model, or if there is no variation in x, this returns 354      * Double.NaN.355      *356      * @return standard error associated with intercept estimate357      */358     public double getInterceptStdErr() {359         return Math.sqrt(360             getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));361     }362 363     /**364      * Returns the standard365      * error of the slope estimate,366      * usually denoted s(b1). 367      *

368      * If there are fewer that three data pairs in the model,369      * or if there is no variation in x, this returns Double.NaN.370      *371      * @return standard error associated with slope estimate372      */373     public double getSlopeStdErr() {374         return Math.sqrt(getMeanSquareError() / sumXX);375     }376 377     /**378      * Returns the half-width of a 95% confidence interval for the slope379      * estimate.380      *

381      * The 95% confidence interval is 382      *

383      * (getSlope() - getSlopeConfidenceInterval(), 384      * getSlope() + getSlopeConfidenceInterval())385      *

386      * If there are fewer that three observations in the 387      * model, or if there is no variation in x, this returns 388      * Double.NaN.389      *

390      * Usage Note:
391      * The validity of this statistic depends on the assumption that the 392      * observations included in the model are drawn from a393      * 394      * Bivariate Normal Distribution.395      *396      * @return half-width of 95% confidence interval for the slope estimate397      * 398      * @throws MathException if the confidence interval can not be computed.399      */400     public double getSlopeConfidenceInterval() throws MathException {401         return getSlopeConfidenceInterval(0.05d);402     }403 404     /**405      * Returns the half-width of a (100-100*alpha)% confidence interval for 406      * the slope estimate.407      *

408      * The (100-100*alpha)% confidence interval is 409      *

410      * (getSlope() - getSlopeConfidenceInterval(), 411      * getSlope() + getSlopeConfidenceInterval())412      *

413      * To request, for example, a 99% confidence interval, use 414      * alpha = .01415      *

416      * Usage Note:
417      * The validity of this statistic depends on the assumption that the 418      * observations included in the model are drawn from a419      * 420      * Bivariate Normal Distribution.421      *

422      * Preconditions:

423      *
• If there are fewer that three observations in the 424      * model, or if there is no variation in x, this returns 425      * Double.NaN. 426      *
• 427      *
• (0 < alpha < 1); otherwise an 428      * IllegalArgumentException is thrown.429      *
430      *431      * @param alpha the desired significance level 432      * @return half-width of 95% confidence interval for the slope estimate433      * @throws MathException if the confidence interval can not be computed.434      */435     public double getSlopeConfidenceInterval(double alpha)436         throws MathException {437         if (alpha >= 1 || alpha <= 0) {438             throw new IllegalArgumentException ();439         }440         return getSlopeStdErr() *441             getTDistribution().inverseCumulativeProbability(1d - alpha / 2d);442     }443 444     /**445      * Returns the significance level of the slope (equiv) correlation. 446      *

447      * Specifically, the returned value is the smallest alpha448      * such that the slope confidence interval with significance level449      * equal to alpha does not include 0.450      * On regression output, this is often denoted Prob(|t| > 0)451      *

452      * Usage Note:
453      * The validity of this statistic depends on the assumption that the 454      * observations included in the model are drawn from a455      * 456      * Bivariate Normal Distribution.457      *

458      * If there are fewer that three observations in the 459      * model, or if there is no variation in x, this returns 460      * Double.NaN.461      *462      * @return significance level for slope/correlation463      * @throws MathException if the significance level can not be computed.464      */465     public double getSignificance() throws MathException {466         return 2d* (1.0 - getTDistribution().cumulativeProbability(467                     Math.abs(getSlope()) / getSlopeStdErr()));468     }469 470     // ---------------------Private methods-----------------------------------471 472     /**473     * Returns the intercept of the estimated regression line, given the slope.474     *

475     * Will return NaN if slope is NaN.476     *477     * @param slope current slope478     * @return the intercept of the regression line479     */480     private double getIntercept(double slope) {481         return (sumY - slope * sumX) / ((double) n);482     }483 484     /**485      * Returns the sum of squared errors associated with the regression 486      * model, using the slope of the regression line. 487      *

488      * Returns NaN if the slope is NaN.489      * 490      * @param b1 current slope491      * @return sum of squared errors associated with the regression model492      */493     private double getSumSquaredErrors(double b1) {494         return sumYY - sumXY * sumXY / sumXX;495     }496 497     /** 498      * Computes r-square from the slope.499      *

500      * will return NaN if slope is Nan.501      *502      * @param b1 current slope503      * @return r-square504      */505     private double getRSquare(double b1) {506         double ssto = getTotalSumSquares();507         return (ssto - getSumSquaredErrors(b1)) / ssto;508     }509 510     /**511      * Computes SSR from b1.512      * 513      * @param slope regression slope estimate514      * @return sum of squared deviations of predicted y values515      */516     private double getRegressionSumSquares(double slope) {517         return slope * slope * sumXX;518     }519 520     /**521      * Uses distribution framework to get a t distribution instance 522      * with df = n - 2523      *524      * @return t distribution with df = n - 2525      */526     private TDistribution getTDistribution() {527         return DistributionFactory.newInstance().createTDistribution(n - 2);528     }529 }530 Popular Tags