diff --git a/src/reghdfe.ado b/src/reghdfe.ado index 40a7e15..9bf6666 100644 --- a/src/reghdfe.ado +++ b/src/reghdfe.ado @@ -1,4 +1,4 @@ -*! version 4.1.4 17apr2017 +*! version 4.2.0 17apr2017 program reghdfe, eclass * Intercept old+version @@ -448,6 +448,7 @@ program RegressOLS, eclass matrix colnames `b' = `indepvars' matrix colnames `V' = `indepvars' matrix rownames `V' = `indepvars' + _ms_findomitted `b' `V' ereturn post `b' `V', `esample' buildfvinfo depname(`depvar') } else { diff --git a/src/reghdfe.pkg b/src/reghdfe.pkg index b76d17a..1c8d81a 100644 --- a/src/reghdfe.pkg +++ b/src/reghdfe.pkg @@ -29,7 +29,7 @@ d Required packages: d ftools d boottest (for Stata version 12 or earlier) d -d Distribution-Date: 20170406 +d Distribution-Date: 20170417 d f reghdfe.ado diff --git a/src/reghdfe.sthlp b/src/reghdfe.sthlp index 6a8266f..8663cfe 100644 --- a/src/reghdfe.sthlp +++ b/src/reghdfe.sthlp @@ -1,5 +1,5 @@ {smcl} -{* *! version 4.1.4 17apr2017}{...} +{* *! version 4.2.0 17apr2017}{...} {vieweralsosee "[R] areg" "help areg"}{...} {vieweralsosee "[R] xtreg" "help xtreg"}{...} {vieweralsosee "[R] ivregress" "help ivregress"}{...} diff --git a/src/reghdfe_class.mata b/src/reghdfe_class.mata index 3d4682f..889fc94 100644 --- a/src/reghdfe_class.mata +++ b/src/reghdfe_class.mata @@ -102,6 +102,7 @@ class FixedEffects `StringRowVector' dofadjustments // firstpair pairwise cluster continuous `Varname' groupvar `String' residuals + `RowVector' kept // 1 if the regressors are not deemed as omitted (by partial_out+cholsolve+invsym) `String' diopts // Output @@ -187,7 +188,6 @@ class FixedEffects // -data- is either a varlist or a matrix `Variables' y `Varlist' vars - `RowVector' idx `Integer' i if (eltype(data) == "string") { @@ -214,13 +214,6 @@ class FixedEffects assert_msg(cols(y)==cols(vars), "st_data() constructed more columns than expected") - if (timeit) timer_on(52) - idx = `selectindex'(colmax(abs(colminmax(y))) :== 0) - if (timeit) timer_off(52) - if (cols(idx) & verbose>-1) { - printf("{err}WARNING: after demeaning, some variables are just zeros: %s\n", invtokens(vars[idx])) - } - if (timeit) timer_on(53) _partial_out(y, save_tss) if (timeit) timer_off(53) @@ -306,6 +299,7 @@ class FixedEffects `Real' y_mean `Vector' lhs `Vector' alphas + `StringRowVector' vars `Integer' i if (args()<2 | save_tss==.) save_tss = 0 @@ -349,6 +343,9 @@ class FixedEffects if (timeit) timer_off(60) + // Compute 2-norm of each var, to see if we need to drop as regressors + kept = diagonal(cross(y, y))' + // Intercept LSMR case if (acceleration=="lsmr") { // RRE benchmarking @@ -366,45 +363,54 @@ class FixedEffects } return } + else { + // Standardize variables + if (verbose > 0) printf("{txt} - Standardizing variables\n") + if (timeit) timer_on(61) + stdevs = reghdfe_standardize(y) + if (timeit) timer_off(61) - // Standardize variables - if (verbose > 0) printf("{txt} - Standardizing variables\n") - if (timeit) timer_on(61) - stdevs = reghdfe_standardize(y) - if (timeit) timer_off(61) - - // RRE benchmarking - rre_true_residual = rre_true_residual / stdevs[1] - if (compute_rre) rre_depvar_norm = norm(y[., 1]) + // RRE benchmarking + if (compute_rre) { + rre_true_residual = rre_true_residual / stdevs[1] + rre_depvar_norm = norm(y[., 1]) + } - // Solve - if (verbose>0) printf("{txt} - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance) - if (verbose==1) printf("{txt} - Iterating:") - if (verbose>1) printf("{txt} ") - converged = 0 - iteration_count = 0 - if (timeit) timer_on(62) - y = (*func_accel)(this, y, funct_transform) :* stdevs // this is like python's self - if (timeit) timer_off(62) - // converged gets updated by check_convergence() - - if (prune) { - assert(G==2) - if (timeit) timer_on(63) - _expand_1core(y) - if (timeit) timer_off(63) + // Solve + if (verbose>0) printf("{txt} - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance) + if (verbose==1) printf("{txt} - Iterating:") + if (verbose>1) printf("{txt} ") + converged = 0 // converged will get updated by check_convergence() + iteration_count = 0 + if (timeit) timer_on(62) + y = (*func_accel)(this, y, funct_transform) :* stdevs // 'this' is like python's self + if (timeit) timer_off(62) + + if (prune) { + assert(G==2) + if (timeit) timer_on(63) + _expand_1core(y) + if (timeit) timer_off(63) + } } // Standardizing makes it hard to detect values that are perfectly collinear with the absvars - // in which case they should be 0.00 but they end up as 1e-16 + // in which case they should be 0.00 but they end up as e.g. 1e-16 // EG: reghdfe price ibn.foreign , absorb(foreign) // This will edit to zero entire columns where *ALL* values are very close to zero if (timeit) timer_on(64) - needs_zeroing = colmax(abs(colminmax(y))) :< 1e-8 // chose something relatively close to epsilon() ~ 1e-16 - needs_zeroing = `selectindex'(needs_zeroing) + kept = (diagonal(cross(y, y))' :/ kept) :> (tolerance*1e-1) + needs_zeroing = `selectindex'(!kept) if (cols(needs_zeroing)) { y[., needs_zeroing] = J(rows(y), cols(needs_zeroing), 0) + vars = tokens(varlist) + assert_msg(cols(vars)==cols(y), "cols(vars)!=cols(y)") + for (i=1; i<=cols(vars); i++) { + if (!kept[i] & verbose>-1) { + printf("{txt}note: omitted %s; close to zero after partialling-out (tol =%3.1e)\n", vars[i], tolerance*1e-1) + } + } } if (timeit) timer_off(64) } diff --git a/src/reghdfe_common.mata b/src/reghdfe_common.mata index 7d9a28b..e113608 100644 --- a/src/reghdfe_common.mata +++ b/src/reghdfe_common.mata @@ -132,7 +132,6 @@ mata: `Variable' resid `Real' eps `Integer' i - `StringRowVector' indepvars if (S.timeit) timer_on(90) reghdfe_solve_ols(S, X, b=., V=., N=., rank=., df_r=., resid=.) @@ -142,15 +141,14 @@ mata: // Add "o." prefix to omitted regressors eps = sqrt(epsilon(1)) - indepvars = tokens(S.indepvars) for (i=1; i<=rows(b); i++) { - if (abs(b[i]) -1) { + printf("{txt}note: %s omitted because of collinearity\n", tokens(S.indepvars)[i]) + //stata(sprintf("_ms_put_omit %s", indepvars[i])) + //indepvars[i] = st_global("s(ospec)") + // This is now one in reghdfe.ado with -_ms_findomitted- } } - S.indepvars = invtokens(indepvars) st_matrix(Vname, V) st_numscalar(nname, N) @@ -178,12 +176,11 @@ mata: `Integer' df_r, `Vector' resid) { - // Hack: the first col of X is actually y + // Hack: the first col of X is actually y! `Integer' K, KK `Matrix' xx, inv_xx, W, inv_V `Vector' xy, w `Integer' used_df_r - `RowVector' kept if (S.vcetype == "unadjusted" & S.weight_type=="pweight") S.vcetype = "robust" if (S.verbose > 0) printf("\n{txt} ## Solving least-squares regression of partialled-out variables\n\n") @@ -215,8 +212,31 @@ mata: xy = K ? xx[| 2 , 1 \ K+1 , 1 |] : J(0, 1, .) xx = K ? xx[| 2 , 2 \ K+1 , K+1 |] : J(0, 0, .) if (S.timeit) timer_off(91) - - b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol? + + // This matrix indicates what regressors are not collinear + S.kept = K ? S.kept[2..K+1] : J(1, 0, .) + + // Bread of the robust VCV matrix + // Compute this early so we can update the list of collinear regressors + if (S.timeit) timer_on(95) + inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, S.kept) + S.df_m = rank = K - diag0cnt(inv_xx) + KK = rank + S.df_a + S.df_r = N - KK // replaced when clustering + if (S.timeit) timer_off(95) + + // Compute betas + if (S.has_weights) { + b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol? + } + else { + // This is more numerically accurate but doesn't handle weights + // See: http://www.stata.com/statalist/archive/2012-02/msg00956.html + b = J(K, 1, 0) + if (cols(S.kept)) { + b[S.kept] = qrsolve(X[., 1:+S.kept], X[., 1]) + } + } if (S.timeit) timer_on(92) resid = X * (1 \ -b) // y - X * b @@ -236,13 +256,6 @@ mata: S.rss = quadcross(resid, w, resid) // do before reghdfe_robust() modifies w if (S.timeit) timer_off(94) - // Bread of the robust VCV matrix - if (S.timeit) timer_on(95) - inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, kept=.) // invsym(xx, 1..K) - S.df_m = rank = K - diag0cnt(inv_xx) - KK = rank + S.df_a - S.df_r = N - KK // replaced when clustering - if (S.timeit) timer_off(95) // Compute full VCE if (S.timeit) timer_on(96) @@ -263,16 +276,16 @@ mata: // Wald test: joint significance if (S.timeit) timer_on(97) - inv_V = invsym(V[kept, kept]) // this might not be of full rank but numerical inaccuracies hide it + inv_V = invsym(V[S.kept, S.kept]) // this might not be of full rank but numerical inaccuracies hide it if (diag0cnt(inv_V)) { if (S.verbose > -1) printf("{txt}(Warning: missing F statistic; dropped variables due to collinearity or too few clusters)\n") W = . } - else if (length(b[kept])==0) { + else if (length(b[S.kept])==0) { W = . } else { - W = b[kept]' * inv_V * b[kept] / S.df_m + W = b[S.kept]' * inv_V * b[S.kept] / S.df_m if (missing(W) & S.verbose > -1) printf("{txt}(Warning: missing F statistic)\n") } if (S.timeit) timer_off(97) diff --git a/test/omitted.do b/test/omitted.do new file mode 100644 index 0000000..2dd1135 --- /dev/null +++ b/test/omitted.do @@ -0,0 +1,123 @@ +noi cscript "reghdfe: ensure we omit the correct variables" adofile reghdfe + +* Setup + local included_e /// + scalar: N rmse tss rss r2 r2_a df_r df_m ll ll_0 /// mss F + matrix: trim_b trim_V /// + macros: wexp wtype + +* [TEST] Simplest case + sysuse auto, clear + gen z = 0 + reghdfe price z, noab + matrix b = e(b) + assert b[1,1]==0 + matrix list b + _ms_omit_info b + assert r(k_omit)==1 + + +* [TEST] Simple case +* https://github.com/sergiocorreia/reghdfe/issues/93 + + sysuse auto, clear + gen lowrep = rep78<4 + loc lhs price + loc rhs lowrep + loc absvars rep78 + + * 1. Run benchmark + areg `lhs' `rhs', absorb(`absvars') + matrix list e(b) + + trim_cons + local bench_df_a = e(df_a) + storedresults save benchmark e() + + * 2. Run reghdfe + reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1) + matrix list e(b) + notrim + storedresults compare benchmark e(), tol(1e-12) include(`included_e') + assert `bench_df_a'==e(df_a)-1 + +* [TEST] https://github.com/sergiocorreia/reghdfe/issues/95 + clear + set obs 100 + set seed 123 + gen double y = runiform() * 1e8 + gen double z = runiform() + gen byte c = 1 + + loc lhs z + loc rhs y + loc absvars c + + * 1. Run benchmark + areg `lhs' `rhs', absorb(`absvars') + matrix list e(b) + + trim_cons + local bench_df_a = e(df_a) + storedresults save benchmark e() + + * 2. Run reghdfe + reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1) + matrix list e(b) + notrim + storedresults compare benchmark e(), tol(1e-12) include(`included_e') + assert `bench_df_a'==e(df_a)-1 + + +* [TEST] https://github.com/sergiocorreia/reghdfe/issues/94 + clear +input w x y unity +w x y unity +1 -1.10565 -1.223344 1 +1 -1.109139 -1.127615 1 +0 -.8497003 -1.207231 1 +1 -.2734231 -1.294683 1 +0 .0063879 -.8343916 1 +1 .7669702 -.8245103 1 +end + + gen z = 0 + reghdfe y z, noab + + reghdfe y i.w##i.unity x, a(unity) tol(1e-12) + matrix list e(b) + matrix list e(V) + + areg y i.w##i.unity x, a(unity) + matrix list e(b) + matrix list e(V) + + + loc lhs y + loc rhs i.w##i.unity x + loc absvars unity + + * 1. Run benchmark + areg `lhs' `rhs', absorb(`absvars') + matrix list e(b) + + trim_cons + local bench_df_a = e(df_a) + storedresults save benchmark e() + + * 2. Run reghdfe + reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1) + matrix list e(b) + notrim + matrix b = e(b) + matrix V = e(V) + assert abs(SR__benchmark_trim_b[1,2] - b[1,1]) < 1e-8 + assert abs(SR__benchmark_trim_b[1,6] - b[1,4]) < 1e-8 + assert abs(SR__benchmark_trim_V[2,2] - V[1,1]) < 1e-8 + assert abs(SR__benchmark_trim_V[6,6] - V[4,4]) < 1e-8 + + * Done! + +storedresults drop benchmark +clear matrix +exit diff --git a/test/testall.do b/test/testall.do index 5ed6aef..1306d28 100644 --- a/test/testall.do +++ b/test/testall.do @@ -22,6 +22,7 @@ run missing-absvar run options + run omitted run alphas cap run cluster-after-expand