KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > commons > math > stat > regression > SimpleRegression


1 /*
2  * Copyright 2003-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 package org.apache.commons.math.stat.regression;
18 import java.io.Serializable JavaDoc;
19
20 import org.apache.commons.math.MathException;
21 import org.apache.commons.math.distribution.DistributionFactory;
22 import org.apache.commons.math.distribution.TDistribution;
23
24 /**
25  * Estimates an ordinary least squares regression model
26  * with one independent variable.
27  * <p>
28  * <code> y = intercept + slope * x </code>
29  * <p>
30  * Standard errors for <code>intercept</code> and <code>slope</code> are
31  * available as well as ANOVA, r-square and Pearson's r statistics.
32  * <p>
33  * Observations (x,y pairs) can be added to the model one at a time or they
34  * can be provided in a 2-dimensional array. The observations are not stored
35  * in memory, so there is no limit to the number of observations that can be
36  * added to the model.
37  * <p>
38  * <strong>Usage Notes</strong>: <ul>
39  * <li> When there are fewer than two observations in the model, or when
40  * there is no variation in the x values (i.e. all x values are the same)
41  * all statistics return <code>NaN</code>. At least two observations with
42  * different x coordinates are requred to estimate a bivariate regression
43  * model.
44  * </li>
45  * <li> getters for the statistics always compute values based on the current
46  * set of observations -- i.e., you can get statistics, then add more data
47  * and get updated statistics without using a new instance. There is no
48  * "compute" method that updates all statistics. Each of the getters performs
49  * the necessary computations to return the requested statistic.</li>
50  * </ul>
51  *
52  * @version $Revision$ $Date: 2005-02-26 05:11:52 -0800 (Sat, 26 Feb 2005) $
53  */

54 public class SimpleRegression implements Serializable JavaDoc {
55
56     /** Serializable version identifier */
57     static final long serialVersionUID = -3004689053607543335L;
58
59     /** sum of x values */
60     private double sumX = 0d;
61
62     /** total variation in x (sum of squared deviations from xbar) */
63     private double sumXX = 0d;
64
65     /** sum of y values */
66     private double sumY = 0d;
67
68     /** total variation in y (sum of squared deviations from ybar) */
69     private double sumYY = 0d;
70
71     /** sum of products */
72     private double sumXY = 0d;
73
74     /** number of observations */
75     private long n = 0;
76
77     /** mean of accumulated x values, used in updating formulas */
78     private double xbar = 0;
79
80     /** mean of accumulated y values, used in updating formulas */
81     private double ybar = 0;
82
83     // ---------------------Public methods--------------------------------------
84

85     /**
86      * Create an empty SimpleRegression instance
87      */

88     public SimpleRegression() {
89         super();
90     }
91     
92     /**
93      * Adds the observation (x,y) to the regression data set.
94      * <p>
95      * Uses updating formulas for means and sums of squares defined in
96      * "Algorithms for Computing the Sample Variance: Analysis and
97      * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
98      * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
99      * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985
100      *
101      *
102      * @param x independent variable value
103      * @param y dependent variable value
104      */

105     public void addData(double x, double y) {
106         if (n == 0) {
107             xbar = x;
108             ybar = y;
109         } else {
110             double dx = x - xbar;
111             double dy = y - ybar;
112             sumXX += dx * dx * (double) n / (double) (n + 1.0);
113             sumYY += dy * dy * (double) n / (double) (n + 1.0);
114             sumXY += dx * dy * (double) n / (double) (n + 1.0);
115             xbar += dx / (double) (n + 1.0);
116             ybar += dy / (double) (n + 1.0);
117         }
118         sumX += x;
119         sumY += y;
120         n++;
121     }
122
123     /**
124      * Adds the observations represented by the elements in
125      * <code>data</code>.
126      * <p>
127      * <code>(data[0][0],data[0][1])</code> will be the first observation, then
128      * <code>(data[1][0],data[1][1])</code>, etc.
129      * <p>
130      * This method does not replace data that has already been added. The
131      * observations represented by <code>data</code> are added to the existing
132      * dataset.
133      * <p>
134      * To replace all data, use <code>clear()</code> before adding the new
135      * data.
136      *
137      * @param data array of observations to be added
138      */

139     public void addData(double[][] data) {
140         for (int i = 0; i < data.length; i++) {
141             addData(data[i][0], data[i][1]);
142         }
143     }
144
145     /**
146      * Clears all data from the model.
147      */

148     public void clear() {
149         sumX = 0d;
150         sumXX = 0d;
151         sumY = 0d;
152         sumYY = 0d;
153         sumXY = 0d;
154         n = 0;
155     }
156
157     /**
158      * Returns the number of observations that have been added to the model.
159      *
160      * @return n number of observations that have been added.
161      */

162     public long getN() {
163         return n;
164     }
165
166     /**
167      * Returns the "predicted" <code>y</code> value associated with the
168      * supplied <code>x</code> value, based on the data that has been
169      * added to the model when this method is activated.
170      * <p>
171      * <code> predict(x) = intercept + slope * x </code>
172      * <p>
173      * <strong>Preconditions</strong>: <ul>
174      * <li>At least two observations (with at least two different x values)
175      * must have been added before invoking this method. If this method is
176      * invoked before a model can be estimated, <code>Double,NaN</code> is
177      * returned.
178      * </li></ul>
179      *
180      * @param x input <code>x</code> value
181      * @return predicted <code>y</code> value
182      */

183     public double predict(double x) {
184         double b1 = getSlope();
185         return getIntercept(b1) + b1 * x;
186     }
187
188     /**
189      * Returns the intercept of the estimated regression line.
190      * <p>
191      * The least squares estimate of the intercept is computed using the
192      * <a HREF="http://www.xycoon.com/estimation4.htm">normal equations</a>.
193      * The intercept is sometimes denoted b0.
194      * <p>
195      * <strong>Preconditions</strong>: <ul>
196      * <li>At least two observations (with at least two different x values)
197      * must have been added before invoking this method. If this method is
198      * invoked before a model can be estimated, <code>Double,NaN</code> is
199      * returned.
200      * </li></ul>
201      *
202      * @return the intercept of the regression line
203      */

204     public double getIntercept() {
205         return getIntercept(getSlope());
206     }
207
208     /**
209     * Returns the slope of the estimated regression line.
210     * <p>
211     * The least squares estimate of the slope is computed using the
212     * <a HREF="http://www.xycoon.com/estimation4.htm">normal equations</a>.
213     * The slope is sometimes denoted b1.
214     * <p>
215     * <strong>Preconditions</strong>: <ul>
216     * <li>At least two observations (with at least two different x values)
217     * must have been added before invoking this method. If this method is
218     * invoked before a model can be estimated, <code>Double.NaN</code> is
219     * returned.
220     * </li></ul>
221     *
222     * @return the slope of the regression line
223     */

224     public double getSlope() {
225         if (n < 2) {
226             return Double.NaN; //not enough data
227
}
228         if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
229             return Double.NaN; //not enough variation in x
230
}
231         return sumXY / sumXX;
232     }
233
234     /**
235      * Returns the <a HREF="http://www.xycoon.com/SumOfSquares.htm">
236      * sum of squared errors</a> (SSE) associated with the regression
237      * model.
238      * <p>
239      * <strong>Preconditions</strong>: <ul>
240      * <li>At least two observations (with at least two different x values)
241      * must have been added before invoking this method. If this method is
242      * invoked before a model can be estimated, <code>Double,NaN</code> is
243      * returned.
244      * </li></ul>
245      *
246      * @return sum of squared errors associated with the regression model
247      */

248     public double getSumSquaredErrors() {
249         return getSumSquaredErrors(getSlope());
250     }
251
252     /**
253      * Returns the sum of squared deviations of the y values about their mean.
254      * <p>
255      * This is defined as SSTO
256      * <a HREF="http://www.xycoon.com/SumOfSquares.htm">here</a>.
257      * <p>
258      * If <code>n < 2</code>, this returns <code>Double.NaN</code>.
259      *
260      * @return sum of squared deviations of y values
261      */

262     public double getTotalSumSquares() {
263         if (n < 2) {
264             return Double.NaN;
265         }
266         return sumYY;
267     }
268
269     /**
270      * Returns the sum of squared deviations of the predicted y values about
271      * their mean (which equals the mean of y).
272      * <p>
273      * This is usually abbreviated SSR or SSM. It is defined as SSM
274      * <a HREF="http://www.xycoon.com/SumOfSquares.htm">here</a>
275      * <p>
276      * <strong>Preconditions</strong>: <ul>
277      * <li>At least two observations (with at least two different x values)
278      * must have been added before invoking this method. If this method is
279      * invoked before a model can be estimated, <code>Double.NaN</code> is
280      * returned.
281      * </li></ul>
282      *
283      * @return sum of squared deviations of predicted y values
284      */

285     public double getRegressionSumSquares() {
286         return getRegressionSumSquares(getSlope());
287     }
288
289     /**
290      * Returns the sum of squared errors divided by the degrees of freedom,
291      * usually abbreviated MSE.
292      * <p>
293      * If there are fewer than <strong>three</strong> data pairs in the model,
294      * or if there is no variation in <code>x</code>, this returns
295      * <code>Double.NaN</code>.
296      *
297      * @return sum of squared deviations of y values
298      */

299     public double getMeanSquareError() {
300         if (n < 3) {
301             return Double.NaN;
302         }
303         return getSumSquaredErrors() / (double) (n - 2);
304     }
305
306     /**
307      * Returns <a HREF="http://mathworld.wolfram.com/CorrelationCoefficient.html">
308      * Pearson's product moment correlation coefficient</a>,
309      * usually denoted r.
310      * <p>
311      * <strong>Preconditions</strong>: <ul>
312      * <li>At least two observations (with at least two different x values)
313      * must have been added before invoking this method. If this method is
314      * invoked before a model can be estimated, <code>Double,NaN</code> is
315      * returned.
316      * </li></ul>
317      *
318      * @return Pearson's r
319      */

320     public double getR() {
321         double b1 = getSlope();
322         double result = Math.sqrt(getRSquare(b1));
323         if (b1 < 0) {
324             result = -result;
325         }
326         return result;
327     }
328
329     /**
330      * Returns the <a HREF="http://www.xycoon.com/coefficient1.htm">
331      * coefficient of determination</a>,
332      * usually denoted r-square.
333      * <p>
334      * <strong>Preconditions</strong>: <ul>
335      * <li>At least two observations (with at least two different x values)
336      * must have been added before invoking this method. If this method is
337      * invoked before a model can be estimated, <code>Double,NaN</code> is
338      * returned.
339      * </li></ul>
340      *
341      * @return r-square
342      */

343     public double getRSquare() {
344         return getRSquare(getSlope());
345     }
346
347     /**
348      * Returns the <a HREF="http://www.xycoon.com/standarderrorb0.htm">
349      * standard error of the intercept estimate</a>,
350      * usually denoted s(b0).
351      * <p>
352      * If there are fewer that <strong>three</strong> observations in the
353      * model, or if there is no variation in x, this returns
354      * <code>Double.NaN</code>.
355      *
356      * @return standard error associated with intercept estimate
357      */

358     public double getInterceptStdErr() {
359         return Math.sqrt(
360             getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
361     }
362
363     /**
364      * Returns the <a HREF="http://www.xycoon.com/standerrorb(1).htm">standard
365      * error of the slope estimate</a>,
366      * usually denoted s(b1).
367      * <p>
368      * If there are fewer that <strong>three</strong> data pairs in the model,
369      * or if there is no variation in x, this returns <code>Double.NaN</code>.
370      *
371      * @return standard error associated with slope estimate
372      */

373     public double getSlopeStdErr() {
374         return Math.sqrt(getMeanSquareError() / sumXX);
375     }
376
377     /**
378      * Returns the half-width of a 95% confidence interval for the slope
379      * estimate.
380      * <p>
381      * The 95% confidence interval is
382      * <p>
383      * <code>(getSlope() - getSlopeConfidenceInterval(),
384      * getSlope() + getSlopeConfidenceInterval())</code>
385      * <p>
386      * If there are fewer that <strong>three</strong> observations in the
387      * model, or if there is no variation in x, this returns
388      * <code>Double.NaN</code>.
389      * <p>
390      * <strong>Usage Note</strong>:<br>
391      * The validity of this statistic depends on the assumption that the
392      * observations included in the model are drawn from a
393      * <a HREF="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
394      * Bivariate Normal Distribution</a>.
395      *
396      * @return half-width of 95% confidence interval for the slope estimate
397      *
398      * @throws MathException if the confidence interval can not be computed.
399      */

400     public double getSlopeConfidenceInterval() throws MathException {
401         return getSlopeConfidenceInterval(0.05d);
402     }
403
404     /**
405      * Returns the half-width of a (100-100*alpha)% confidence interval for
406      * the slope estimate.
407      * <p>
408      * The (100-100*alpha)% confidence interval is
409      * <p>
410      * <code>(getSlope() - getSlopeConfidenceInterval(),
411      * getSlope() + getSlopeConfidenceInterval())</code>
412      * <p>
413      * To request, for example, a 99% confidence interval, use
414      * <code>alpha = .01</code>
415      * <p>
416      * <strong>Usage Note</strong>:<br>
417      * The validity of this statistic depends on the assumption that the
418      * observations included in the model are drawn from a
419      * <a HREF="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
420      * Bivariate Normal Distribution</a>.
421      * <p>
422      * <strong> Preconditions:</strong><ul>
423      * <li>If there are fewer that <strong>three</strong> observations in the
424      * model, or if there is no variation in x, this returns
425      * <code>Double.NaN</code>.
426      * </li>
427      * <li><code>(0 < alpha < 1)</code>; otherwise an
428      * <code>IllegalArgumentException</code> is thrown.
429      * </li></ul>
430      *
431      * @param alpha the desired significance level
432      * @return half-width of 95% confidence interval for the slope estimate
433      * @throws MathException if the confidence interval can not be computed.
434      */

435     public double getSlopeConfidenceInterval(double alpha)
436         throws MathException {
437         if (alpha >= 1 || alpha <= 0) {
438             throw new IllegalArgumentException JavaDoc();
439         }
440         return getSlopeStdErr() *
441             getTDistribution().inverseCumulativeProbability(1d - alpha / 2d);
442     }
443
444     /**
445      * Returns the significance level of the slope (equiv) correlation.
446      * <p>
447      * Specifically, the returned value is the smallest <code>alpha</code>
448      * such that the slope confidence interval with significance level
449      * equal to <code>alpha</code> does not include <code>0</code>.
450      * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
451      * <p>
452      * <strong>Usage Note</strong>:<br>
453      * The validity of this statistic depends on the assumption that the
454      * observations included in the model are drawn from a
455      * <a HREF="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
456      * Bivariate Normal Distribution</a>.
457      * <p>
458      * If there are fewer that <strong>three</strong> observations in the
459      * model, or if there is no variation in x, this returns
460      * <code>Double.NaN</code>.
461      *
462      * @return significance level for slope/correlation
463      * @throws MathException if the significance level can not be computed.
464      */

465     public double getSignificance() throws MathException {
466         return 2d* (1.0 - getTDistribution().cumulativeProbability(
467                     Math.abs(getSlope()) / getSlopeStdErr()));
468     }
469
470     // ---------------------Private methods-----------------------------------
471

472     /**
473     * Returns the intercept of the estimated regression line, given the slope.
474     * <p>
475     * Will return <code>NaN</code> if slope is <code>NaN</code>.
476     *
477     * @param slope current slope
478     * @return the intercept of the regression line
479     */

480     private double getIntercept(double slope) {
481         return (sumY - slope * sumX) / ((double) n);
482     }
483
484     /**
485      * Returns the sum of squared errors associated with the regression
486      * model, using the slope of the regression line.
487      * <p>
488      * Returns NaN if the slope is NaN.
489      *
490      * @param b1 current slope
491      * @return sum of squared errors associated with the regression model
492      */

493     private double getSumSquaredErrors(double b1) {
494         return sumYY - sumXY * sumXY / sumXX;
495     }
496
497     /**
498      * Computes r-square from the slope.
499      * <p>
500      * will return NaN if slope is Nan.
501      *
502      * @param b1 current slope
503      * @return r-square
504      */

505     private double getRSquare(double b1) {
506         double ssto = getTotalSumSquares();
507         return (ssto - getSumSquaredErrors(b1)) / ssto;
508     }
509
510     /**
511      * Computes SSR from b1.
512      *
513      * @param slope regression slope estimate
514      * @return sum of squared deviations of predicted y values
515      */

516     private double getRegressionSumSquares(double slope) {
517         return slope * slope * sumXX;
518     }
519
520     /**
521      * Uses distribution framework to get a t distribution instance
522      * with df = n - 2
523      *
524      * @return t distribution with df = n - 2
525      */

526     private TDistribution getTDistribution() {
527         return DistributionFactory.newInstance().createTDistribution(n - 2);
528     }
529 }
530
Popular Tags