From 79a8c0134ea089f93811be404a5405c38b5a596a Mon Sep 17 00:00:00 2001
From: Sergio Correia
Date: Mon, 17 Apr 2017 18:25:34 -0400
Subject: [PATCH] Fix #94 and #95

Issues resolved:

1) qrsolve(XX, XY) suffers from numerical inaccuracies in some cases, so we
   fall back to qrsolve(X, Y) when there are no weights. (Note: this might be
   a bit slower, since XX and XY are already precomputed. It could be
   optimized further, but for now we leave it as is.)

2) The methods used to detect omitted variables differed across the code.
   When demeaning, we checked whether the regressors were close to zero, but
   used an absolute threshold (1e-8), which fails when there is a large
   scaling difference between Y and X. Now we assume a variable has been
   absorbed by the absvars if the ratio w'w / z'z falls below 1e-9, where
   z = the original variable and w = the demeaned variable. This 1e-9
   threshold also scales with tolerance(), as it is just tolerance*1e-1.

   We also use invsym() as the main driver for deciding whether to drop a
   variable, as that is what the built-in tools use. The inputs of qrsolve()
   then exclude the omitted variables, to prevent any issue.
---
 src/reghdfe.ado         |   3 +-
 src/reghdfe.pkg         |   2 +-
 src/reghdfe.sthlp       |   2 +-
 src/reghdfe_class.mata  |  78 +++++++++++++------------
 src/reghdfe_common.mata |  55 +++++++++++-------
 test/omitted.do         | 123 ++++++++++++++++++++++++++++++++++++++++
 test/testall.do         |   1 +
 7 files changed, 204 insertions(+), 60 deletions(-)
 create mode 100644 test/omitted.do

diff --git a/src/reghdfe.ado b/src/reghdfe.ado
index 40a7e15..9bf6666 100644
--- a/src/reghdfe.ado
+++ b/src/reghdfe.ado
@@ -1,4 +1,4 @@
-*! version 4.1.4 17apr2017
+*! version 4.2.0 17apr2017
 program reghdfe, eclass
 
 * Intercept old+version
@@ -448,6 +448,7 @@ program RegressOLS, eclass
     matrix colnames `b' = `indepvars'
     matrix colnames `V' = `indepvars'
     matrix rownames `V' = `indepvars'
+    _ms_findomitted `b' `V'
     ereturn post `b' `V', `esample' buildfvinfo depname(`depvar')
 }
 else {
diff --git a/src/reghdfe.pkg b/src/reghdfe.pkg
index b76d17a..1c8d81a 100644
--- a/src/reghdfe.pkg
+++ b/src/reghdfe.pkg
@@ -29,7 +29,7 @@
 d Required packages:
 d  ftools
 d  boottest (for Stata version 12 or earlier)
 d
-d Distribution-Date: 20170406
+d Distribution-Date: 20170417
 d
 f reghdfe.ado
diff --git a/src/reghdfe.sthlp b/src/reghdfe.sthlp
index 6a8266f..8663cfe 100644
--- a/src/reghdfe.sthlp
+++ b/src/reghdfe.sthlp
@@ -1,5 +1,5 @@
 {smcl}
-{* *! version 4.1.4 17apr2017}{...}
+{* *! version 4.2.0 17apr2017}{...}
 {vieweralsosee "[R] areg" "help areg"}{...}
 {vieweralsosee "[R] xtreg" "help xtreg"}{...}
 {vieweralsosee "[R] ivregress" "help ivregress"}{...}
diff --git a/src/reghdfe_class.mata b/src/reghdfe_class.mata
index 3d4682f..889fc94 100644
--- a/src/reghdfe_class.mata
+++ b/src/reghdfe_class.mata
@@ -102,6 +102,7 @@ class FixedEffects
     `StringRowVector'   dofadjustments      // firstpair pairwise cluster continuous
     `Varname'           groupvar
     `String'            residuals
+    `RowVector'         kept                // 1 if the regressors are not deemed as omitted (by partial_out+cholsolve+invsym)
     `String'            diopts
 
     // Output
@@ -187,7 +188,6 @@ class FixedEffects
     // -data- is either a varlist or a matrix
     `Variables'         y
     `Varlist'           vars
-    `RowVector'         idx
     `Integer'           i
 
     if (eltype(data) == "string") {
@@ -214,13 +214,6 @@ class FixedEffects
 
         assert_msg(cols(y)==cols(vars), "st_data() constructed more columns than expected")
 
-        if (timeit) timer_on(52)
-        idx = `selectindex'(colmax(abs(colminmax(y))) :== 0)
-        if (timeit) timer_off(52)
-        if (cols(idx) & verbose>-1) {
-            printf("{err}WARNING: after demeaning, some variables are just zeros: %s\n", invtokens(vars[idx]))
-        }
-
         if (timeit) timer_on(53)
         _partial_out(y, save_tss)
         if (timeit) timer_off(53)
@@ -306,6 +299,7 @@ class FixedEffects
        `Real'                  y_mean
        `Vector'                lhs
        `Vector'                alphas
+       `StringRowVector'       vars
        `Integer'               i
 
        if (args()<2 | save_tss==.) save_tss = 0
@@ -349,6 +343,9 @@ class FixedEffects
 
        if (timeit) timer_off(60)
 
+       // Compute 2-norm of each var, to see if we need to drop as regressors
+       kept = diagonal(cross(y, y))'
+
       // Intercept LSMR case
       if (acceleration=="lsmr") {
           // RRE benchmarking
@@ -366,45 +363,54 @@ class FixedEffects
           }
           return
       }
+      else {
+          // Standardize variables
+          if (verbose > 0) printf("{txt}   - Standardizing variables\n")
+          if (timeit) timer_on(61)
+          stdevs = reghdfe_standardize(y)
+          if (timeit) timer_off(61)
-      // Standardize variables
-      if (verbose > 0) printf("{txt}   - Standardizing variables\n")
-      if (timeit) timer_on(61)
-      stdevs = reghdfe_standardize(y)
-      if (timeit) timer_off(61)
-
-      // RRE benchmarking
-      rre_true_residual = rre_true_residual / stdevs[1]
-      if (compute_rre) rre_depvar_norm = norm(y[., 1])
+
+          // RRE benchmarking
+          if (compute_rre) {
+              rre_true_residual = rre_true_residual / stdevs[1]
+              rre_depvar_norm = norm(y[., 1])
+          }
 
-      // Solve
-      if (verbose>0) printf("{txt}   - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance)
-      if (verbose==1) printf("{txt}   - Iterating:")
-      if (verbose>1) printf("{txt}      ")
-      converged = 0
-      iteration_count = 0
-      if (timeit) timer_on(62)
-      y = (*func_accel)(this, y, funct_transform) :* stdevs // this is like python's self
-      if (timeit) timer_off(62)
-      // converged gets updated by check_convergence()
-
-      if (prune) {
-          assert(G==2)
-          if (timeit) timer_on(63)
-          _expand_1core(y)
-          if (timeit) timer_off(63)
+          // Solve
+          if (verbose>0) printf("{txt}   - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance)
+          if (verbose==1) printf("{txt}   - Iterating:")
+          if (verbose>1) printf("{txt}      ")
+          converged = 0 // converged will get updated by check_convergence()
+          iteration_count = 0
+          if (timeit) timer_on(62)
+          y = (*func_accel)(this, y, funct_transform) :* stdevs // 'this' is like python's self
+          if (timeit) timer_off(62)
+
+          if (prune) {
+              assert(G==2)
+              if (timeit) timer_on(63)
+              _expand_1core(y)
+              if (timeit) timer_off(63)
+          }
      }
      // Standardizing makes it hard to detect values that are perfectly collinear with the absvars
-     // in which case they should be 0.00 but they end up as 1e-16
+     // in which case they should be 0.00 but they end up as e.g. 1e-16
      // EG: reghdfe price ibn.foreign , absorb(foreign)
      // This will edit to zero entire columns where *ALL* values are very close to zero
      if (timeit) timer_on(64)
-     needs_zeroing = colmax(abs(colminmax(y))) :< 1e-8 // chose something relatively close to epsilon() ~ 1e-16
-     needs_zeroing = `selectindex'(needs_zeroing)
+     kept = (diagonal(cross(y, y))' :/ kept) :> (tolerance*1e-1)
+     needs_zeroing = `selectindex'(!kept)
      if (cols(needs_zeroing)) {
          y[., needs_zeroing] = J(rows(y), cols(needs_zeroing), 0)
+         vars = tokens(varlist)
+         assert_msg(cols(vars)==cols(y), "cols(vars)!=cols(y)")
+         for (i=1; i<=cols(vars); i++) {
+             if (!kept[i] & verbose>-1) {
+                 printf("{txt}note: omitted %s; close to zero after partialling-out (tol =%3.1e)\n", vars[i], tolerance*1e-1)
+             }
+         }
      }
      if (timeit) timer_off(64)
  }
diff --git a/src/reghdfe_common.mata b/src/reghdfe_common.mata
index 7d9a28b..e113608 100644
--- a/src/reghdfe_common.mata
+++ b/src/reghdfe_common.mata
@@ -132,7 +132,6 @@ mata:
     `Variable'          resid
     `Real'              eps
     `Integer'           i
-    `StringRowVector'   indepvars
 
     if (S.timeit) timer_on(90)
     reghdfe_solve_ols(S, X, b=., V=., N=., rank=., df_r=., resid=.)
@@ -142,15 +141,14 @@ mata:
 
     // Add "o." prefix to omitted regressors
     eps = sqrt(epsilon(1))
-    indepvars = tokens(S.indepvars)
     for (i=1; i<=rows(b); i++) {
-        if (abs(b[i]) < eps) {
-            stata(sprintf("_ms_put_omit %s", indepvars[i]))
-            indepvars[i] = st_global("s(ospec)")
+        if (!S.kept[i] & S.verbose > -1) {
+            printf("{txt}note: %s omitted because of collinearity\n", tokens(S.indepvars)[i])
+            //stata(sprintf("_ms_put_omit %s", indepvars[i]))
+            //indepvars[i] = st_global("s(ospec)")
+            // This is now done in reghdfe.ado with -_ms_findomitted-
         }
     }
-    S.indepvars = invtokens(indepvars)
 
     st_matrix(Vname, V)
     st_numscalar(nname, N)
@@ -178,12 +176,11 @@ mata:
     `Integer' df_r,
     `Vector' resid)
 {
-    // Hack: the first col of X is actually y
+    // Hack: the first col of X is actually y!
     `Integer'               K, KK
     `Matrix'                xx, inv_xx, W, inv_V
     `Vector'                xy, w
     `Integer'               used_df_r
-    `RowVector'             kept
 
     if (S.vcetype == "unadjusted" & S.weight_type=="pweight") S.vcetype = "robust"
     if (S.verbose > 0) printf("\n{txt} ## Solving least-squares regression of partialled-out variables\n\n")
@@ -215,8 +212,31 @@ mata:
     xy = K ? xx[| 2 , 1 \ K+1 , 1 |] : J(0, 1, .)
     xx = K ? xx[| 2 , 2 \ K+1 , K+1 |] : J(0, 0, .)
     if (S.timeit) timer_off(91)
-
-    b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol?
+
+    // This matrix indicates what regressors are not collinear
+    S.kept = K ? S.kept[2..K+1] : J(1, 0, .)
+
+    // Bread of the robust VCV matrix
+    // Compute this early so we can update the list of collinear regressors
+    if (S.timeit) timer_on(95)
+    inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, S.kept)
+    S.df_m = rank = K - diag0cnt(inv_xx)
+    KK = rank + S.df_a
+    S.df_r = N - KK // replaced when clustering
+    if (S.timeit) timer_off(95)
+
+    // Compute betas
+    if (S.has_weights) {
+        b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol?
+    }
+    else {
+        // This is more numerically accurate but doesn't handle weights
+        // See: http://www.stata.com/statalist/archive/2012-02/msg00956.html
+        b = J(K, 1, 0)
+        if (cols(S.kept)) {
+            b[S.kept] = qrsolve(X[., 1:+S.kept], X[., 1])
+        }
+    }
 
     if (S.timeit) timer_on(92)
     resid = X * (1 \ -b) // y - X * b
@@ -236,13 +256,6 @@ mata:
     S.rss = quadcross(resid, w, resid) // do before reghdfe_robust() modifies w
     if (S.timeit) timer_off(94)
 
-    // Bread of the robust VCV matrix
-    if (S.timeit) timer_on(95)
-    inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, kept=.) // invsym(xx, 1..K)
-    S.df_m = rank = K - diag0cnt(inv_xx)
-    KK = rank + S.df_a
-    S.df_r = N - KK // replaced when clustering
-    if (S.timeit) timer_off(95)
 
     // Compute full VCE
     if (S.timeit) timer_on(96)
@@ -263,16 +276,16 @@ mata:
 
     // Wald test: joint significance
     if (S.timeit) timer_on(97)
-    inv_V = invsym(V[kept, kept]) // this might not be of full rank but numerical inaccuracies hide it
+    inv_V = invsym(V[S.kept, S.kept]) // this might not be of full rank but numerical inaccuracies hide it
     if (diag0cnt(inv_V)) {
         if (S.verbose > -1) printf("{txt}(Warning: missing F statistic; dropped variables due to collinearity or too few clusters)\n")
         W = .
     }
-    else if (length(b[kept])==0) {
+    else if (length(b[S.kept])==0) {
         W = .
     }
     else {
-        W = b[kept]' * inv_V * b[kept] / S.df_m
+        W = b[S.kept]' * inv_V * b[S.kept] / S.df_m
         if (missing(W) & S.verbose > -1) printf("{txt}(Warning: missing F statistic)\n")
     }
     if (S.timeit) timer_off(97)
diff --git a/test/omitted.do b/test/omitted.do
new file mode 100644
index 0000000..2dd1135
--- /dev/null
+++ b/test/omitted.do
@@ -0,0 +1,123 @@
+noi cscript "reghdfe: ensure we omit the correct variables" adofile reghdfe
+
+* Setup
+    local included_e ///
+        scalar: N rmse tss rss r2 r2_a df_r df_m ll ll_0 /// mss F
+        matrix: trim_b trim_V ///
+        macros: wexp wtype
+
+* [TEST] Simplest case
+    sysuse auto, clear
+    gen z = 0
+    reghdfe price z, noab
+    matrix b = e(b)
+    assert b[1,1]==0
+    matrix list b
+    _ms_omit_info b
+    assert r(k_omit)==1
+
+
+* [TEST] Simple case
+* https://github.com/sergiocorreia/reghdfe/issues/93
+
+    sysuse auto, clear
+    gen lowrep = rep78<4
+    loc lhs price
+    loc rhs lowrep
+    loc absvars rep78
+
+    * 1. Run benchmark
+    areg `lhs' `rhs', absorb(`absvars')
+    matrix list e(b)
+
+    trim_cons
+    local bench_df_a = e(df_a)
+    storedresults save benchmark e()
+
+    * 2. Run reghdfe
+    reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
+    matrix list e(b)
+    notrim
+    storedresults compare benchmark e(), tol(1e-12) include(`included_e')
+    assert `bench_df_a'==e(df_a)-1
+
+* [TEST] https://github.com/sergiocorreia/reghdfe/issues/95
+    clear
+    set obs 100
+    set seed 123
+    gen double y = runiform() * 1e8
+    gen double z = runiform()
+    gen byte c = 1
+
+    loc lhs z
+    loc rhs y
+    loc absvars c
+
+    * 1. Run benchmark
+    areg `lhs' `rhs', absorb(`absvars')
+    matrix list e(b)
+
+    trim_cons
+    local bench_df_a = e(df_a)
+    storedresults save benchmark e()
+
+    * 2. Run reghdfe
+    reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
+    matrix list e(b)
+    notrim
+    storedresults compare benchmark e(), tol(1e-12) include(`included_e')
+    assert `bench_df_a'==e(df_a)-1
+
+
+* [TEST] https://github.com/sergiocorreia/reghdfe/issues/94
+    clear
+    input w x y unity
+    1 -1.10565 -1.223344 1
+    1 -1.109139 -1.127615 1
+    0 -.8497003 -1.207231 1
+    1 -.2734231 -1.294683 1
+    0 .0063879 -.8343916 1
+    1 .7669702 -.8245103 1
+    end
+
+    gen z = 0
+    reghdfe y z, noab
+
+    reghdfe y i.w##i.unity x, a(unity) tol(1e-12)
+    matrix list e(b)
+    matrix list e(V)
+
+    areg y i.w##i.unity x, a(unity)
+    matrix list e(b)
+    matrix list e(V)
+
+
+    loc lhs y
+    loc rhs i.w##i.unity x
+    loc absvars unity
+
+    * 1. Run benchmark
+    areg `lhs' `rhs', absorb(`absvars')
+    matrix list e(b)
+
+    trim_cons
+    local bench_df_a = e(df_a)
+    storedresults save benchmark e()
+
+    * 2. Run reghdfe
+    reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
+    matrix list e(b)
+    notrim
+    matrix b = e(b)
+    matrix V = e(V)
+    assert abs(SR__benchmark_trim_b[1,2] - b[1,1]) < 1e-8
+    assert abs(SR__benchmark_trim_b[1,6] - b[1,4]) < 1e-8
+    assert abs(SR__benchmark_trim_V[2,2] - V[1,1]) < 1e-8
+    assert abs(SR__benchmark_trim_V[6,6] - V[4,4]) < 1e-8
+
+* Done!
+
+storedresults drop benchmark
+clear matrix
+exit
diff --git a/test/testall.do b/test/testall.do
index 5ed6aef..1306d28 100644
--- a/test/testall.do
+++ b/test/testall.do
@@ -22,6 +22,7 @@
     run missing-absvar
     run options
+    run omitted
     run alphas
 
     cap run cluster-after-expand
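
As an illustration of the two fixes described in the commit message -- (1) flagging regressors absorbed by the fixed effects via the relative norm w'w/z'z rather than an absolute 1e-8 cutoff, and (2) solving the least-squares problem with qrsolve() directly on the partialled-out data instead of on the precomputed cross-products whenever there are no weights -- here is a small, self-contained Mata sketch. It is not part of the patch; the function and variable names (flag_kept, ols_beta, X_orig, X_demeaned, tol) are hypothetical.

mata:
// Illustrative sketch only (hypothetical names; not part of reghdfe)

// flag_kept(): a column is treated as absorbed by the fixed effects when the
// squared norm that survives demeaning is a negligible share of the original
// squared norm (w'w / z'z <= tol). Assumes no column of X_orig is all zeros.
real rowvector flag_kept(real matrix X_orig, real matrix X_demeaned, real scalar tol)
{
    real rowvector zz, ww
    zz = diagonal(cross(X_orig, X_orig))'          // z'z for each column (before demeaning)
    ww = diagonal(cross(X_demeaned, X_demeaned))'  // w'w for each column (after demeaning)
    return((ww :/ zz) :> tol)                      // 1 = keep, 0 = treat as omitted
}

// ols_beta(): solve OLS over the kept columns only, calling qrsolve() on the
// data itself rather than forming the normal equations; omitted coefficients
// stay exactly zero.
real colvector ols_beta(real matrix X, real colvector y, real rowvector kept)
{
    real colvector b, b_kept
    real scalar    i, j
    b = J(cols(X), 1, 0)
    if (!any(kept)) return(b)
    b_kept = qrsolve(select(X, kept), y)           // drop omitted columns, then solve
    j = 0
    for (i=1; i<=cols(X); i++) {
        if (kept[i]) {
            j = j + 1
            b[i] = b_kept[j]                       // scatter back into the full vector
        }
    }
    return(b)
}
end

Forming X'X roughly squares the condition number of the problem, which is why the weights-free branch of the patch prefers qrsolve() on the raw partialled-out columns; with weights it keeps the precomputed cross-products and reghdfe_cholqrsolve().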