Fix #94 and #95
Issues resolved:

1) qrsolve(XX, XY) suffers from numerical inaccuracies in some cases, so
we switch to qrsolve(X, Y) when there are no weights.

(Note: this might be a bit slower since XX and XY are already
precomputed. It could also be optimized, but for now let's leave it as it
is.)
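
For intuition, here is a minimal Mata sketch (simulated data and
illustrative names, not the reghdfe code path): forming the normal
equations squares the condition number of X, while QR on X itself
avoids that.

    mata:
    // Minimal sketch with simulated, badly scaled data (illustrative only).
    N = 1000
    X = runiform(N, 3) :* (1, 1e-6, 1e6)    // columns on very different scales
    b_true = (1 \ 2 \ 3)
    y = X * b_true + 0.01 * rnormal(N, 1, 0, 1)
    XX = quadcross(X, X)                    // precomputed X'X
    XY = quadcross(X, y)                    // precomputed X'y
    b_normal = qrsolve(XX, XY)              // condition number of X gets squared
    b_qr     = qrsolve(X, y)                // solves on X directly; more accurate
    (b_true, b_normal, b_qr)                // columns: true, normal eqns, QR
    end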

2) The methods used to detect omitted variables differed across parts of
the code. When demeaning, we checked whether the regressors were close
to zero, but used an absolute threshold (1e-8), which doesn't work if
there is a huge scaling difference between Y and X. Now we assume the
variable has been absorbed by the absvars if the ratio w'w / z'z < 1e-9,
where z = the original variable and w = the demeaned variable. This 1e-9
threshold also scales with tolerance(), as it is just tolerance*1e-1.
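
As a sketch of the new check (simulated data and illustrative names,
not the exact reghdfe internals): with z a column before demeaning and
w the same column after demeaning, the variable is flagged as absorbed
when w'w / z'z falls below tolerance*1e-1.

    mata:
    // Sketch of the relative-norm check; column 2 of Z is constant, so
    // demeaning wipes it out and its ratio falls below the threshold.
    N = 100
    Z = runiform(N, 1), J(N, 1, 5)          // one real regressor, one constant
    W = Z :- mean(Z)                        // "demeaned" copy of each column
    tolerance = 1e-8                        // reghdfe's default tolerance()
    zz = diagonal(quadcross(Z, Z))'         // squared 2-norms before demeaning
    ww = diagonal(quadcross(W, W))'         // squared 2-norms after demeaning
    kept = (ww :/ zz) :> (tolerance * 1e-1) // (1, 0): the constant is absorbed
    kept
    end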

We also use invsym() as the main driver for deciding whether to drop a
variable, as that is what the built-in tools use. The inputs of
qrsolve() then exclude the omitted variables, to prevent any issues.
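
A rough sketch of that flow (hypothetical names, simulated data):
invsym() on X'X decides which columns are collinear, and qrsolve() then
only sees the surviving columns, with omitted coefficients left at zero.

    mata:
    // Sketch: flag collinear columns via invsym(), then feed only the kept
    // columns to qrsolve(); omitted coefficients stay at zero.
    // (selectindex() needs Stata 13 or newer.)
    N = 100
    x1 = runiform(N, 1)
    X = x1, 2*x1, runiform(N, 1)            // column 2 is collinear with column 1
    y = runiform(N, 1)
    xx = quadcross(X, X)
    inv_xx = invsym(xx)                     // zeroes rows/cols of dropped columns
    kept = selectindex(diagonal(inv_xx)' :!= 0)
    b = J(cols(X), 1, 0)
    b[kept] = qrsolve(X[., kept], y)        // solve using only non-collinear columns
    b'
    end

The design point is that a single rank decision (from invsym) drives
both the coefficient vector and the Wald test, instead of two different
thresholds that can disagree.
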
sergiocorreia committed Apr 17, 2017
1 parent 9503eee commit 79a8c01
Showing 7 changed files with 204 additions and 60 deletions.
3 changes: 2 additions & 1 deletion src/reghdfe.ado
@@ -1,4 +1,4 @@
*! version 4.1.4 17apr2017
*! version 4.2.0 17apr2017

program reghdfe, eclass
* Intercept old+version
@@ -448,6 +448,7 @@ program RegressOLS, eclass
matrix colnames `b' = `indepvars'
matrix colnames `V' = `indepvars'
matrix rownames `V' = `indepvars'
_ms_findomitted `b' `V'
ereturn post `b' `V', `esample' buildfvinfo depname(`depvar')
}
else {
2 changes: 1 addition & 1 deletion src/reghdfe.pkg
@@ -29,7 +29,7 @@ d Required packages:
d ftools
d boottest (for Stata version 12 or earlier)
d
d Distribution-Date: 20170406
d Distribution-Date: 20170417
d

f reghdfe.ado
2 changes: 1 addition & 1 deletion src/reghdfe.sthlp
@@ -1,5 +1,5 @@
{smcl}
{* *! version 4.1.4 17apr2017}{...}
{* *! version 4.2.0 17apr2017}{...}
{vieweralsosee "[R] areg" "help areg"}{...}
{vieweralsosee "[R] xtreg" "help xtreg"}{...}
{vieweralsosee "[R] ivregress" "help ivregress"}{...}
78 changes: 42 additions & 36 deletions src/reghdfe_class.mata
@@ -102,6 +102,7 @@ class FixedEffects
`StringRowVector' dofadjustments // firstpair pairwise cluster continuous
`Varname' groupvar
`String' residuals
`RowVector' kept // 1 if the regressors are not deemed as omitted (by partial_out+cholsolve+invsym)
`String' diopts
// Output
@@ -187,7 +188,6 @@ class FixedEffects
// -data- is either a varlist or a matrix
`Variables' y
`Varlist' vars
`RowVector' idx
`Integer' i
if (eltype(data) == "string") {
@@ -214,13 +214,6 @@
assert_msg(cols(y)==cols(vars), "st_data() constructed more columns than expected")
if (timeit) timer_on(52)
idx = `selectindex'(colmax(abs(colminmax(y))) :== 0)
if (timeit) timer_off(52)
if (cols(idx) & verbose>-1) {
printf("{err}WARNING: after demeaning, some variables are just zeros: %s\n", invtokens(vars[idx]))
}
if (timeit) timer_on(53)
_partial_out(y, save_tss)
if (timeit) timer_off(53)
@@ -306,6 +299,7 @@
`Real' y_mean
`Vector' lhs
`Vector' alphas
`StringRowVector' vars
`Integer' i
if (args()<2 | save_tss==.) save_tss = 0
@@ -349,6 +343,9 @@
if (timeit) timer_off(60)
// Compute 2-norm of each var, to see if we need to drop as regressors
kept = diagonal(cross(y, y))'
// Intercept LSMR case
if (acceleration=="lsmr") {
// RRE benchmarking
@@ -366,45 +363,54 @@
}
return
}
else {
// Standardize variables
if (verbose > 0) printf("{txt} - Standardizing variables\n")
if (timeit) timer_on(61)
stdevs = reghdfe_standardize(y)
if (timeit) timer_off(61)
// Standardize variables
if (verbose > 0) printf("{txt} - Standardizing variables\n")
if (timeit) timer_on(61)
stdevs = reghdfe_standardize(y)
if (timeit) timer_off(61)
// RRE benchmarking
rre_true_residual = rre_true_residual / stdevs[1]
if (compute_rre) rre_depvar_norm = norm(y[., 1])
// RRE benchmarking
if (compute_rre) {
rre_true_residual = rre_true_residual / stdevs[1]
rre_depvar_norm = norm(y[., 1])
}
// Solve
if (verbose>0) printf("{txt} - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance)
if (verbose==1) printf("{txt} - Iterating:")
if (verbose>1) printf("{txt} ")
converged = 0
iteration_count = 0
if (timeit) timer_on(62)
y = (*func_accel)(this, y, funct_transform) :* stdevs // this is like python's self
if (timeit) timer_off(62)
// converged gets updated by check_convergence()
if (prune) {
assert(G==2)
if (timeit) timer_on(63)
_expand_1core(y)
if (timeit) timer_off(63)
// Solve
if (verbose>0) printf("{txt} - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance)
if (verbose==1) printf("{txt} - Iterating:")
if (verbose>1) printf("{txt} ")
converged = 0 // converged will get updated by check_convergence()
iteration_count = 0
if (timeit) timer_on(62)
y = (*func_accel)(this, y, funct_transform) :* stdevs // 'this' is like python's self
if (timeit) timer_off(62)
if (prune) {
assert(G==2)
if (timeit) timer_on(63)
_expand_1core(y)
if (timeit) timer_off(63)
}
}
// Standardizing makes it hard to detect values that are perfectly collinear with the absvars
// in which case they should be 0.00 but they end up as 1e-16
// in which case they should be 0.00 but they end up as e.g. 1e-16
// EG: reghdfe price ibn.foreign , absorb(foreign)
// This will edit to zero entire columns where *ALL* values are very close to zero
if (timeit) timer_on(64)
needs_zeroing = colmax(abs(colminmax(y))) :< 1e-8 // chose something relatively close to epsilon() ~ 1e-16
needs_zeroing = `selectindex'(needs_zeroing)
kept = (diagonal(cross(y, y))' :/ kept) :> (tolerance*1e-1)
needs_zeroing = `selectindex'(!kept)
if (cols(needs_zeroing)) {
y[., needs_zeroing] = J(rows(y), cols(needs_zeroing), 0)
vars = tokens(varlist)
assert_msg(cols(vars)==cols(y), "cols(vars)!=cols(y)")
for (i=1; i<=cols(vars); i++) {
if (!kept[i] & verbose>-1) {
printf("{txt}note: omitted %s; close to zero after partialling-out (tol =%3.1e)\n", vars[i], tolerance*1e-1)
}
}
}
if (timeit) timer_off(64)
}
55 changes: 34 additions & 21 deletions src/reghdfe_common.mata
@@ -132,7 +132,6 @@ mata:
`Variable' resid
`Real' eps
`Integer' i
`StringRowVector' indepvars
if (S.timeit) timer_on(90)
reghdfe_solve_ols(S, X, b=., V=., N=., rank=., df_r=., resid=.)
@@ -142,15 +141,14 @@ mata:
// Add "o." prefix to omitted regressors
eps = sqrt(epsilon(1))
indepvars = tokens(S.indepvars)
for (i=1; i<=rows(b); i++) {
if (abs(b[i])<eps) {
printf("{txt}note: %s omitted because of collinearity\n", indepvars[i])
stata(sprintf("_ms_put_omit %s", indepvars[i]))
indepvars[i] = st_global("s(ospec)")
if (b[i]==0 & S.verbose > -1) {
printf("{txt}note: %s omitted because of collinearity\n", tokens(S.indepvars)[i])
//stata(sprintf("_ms_put_omit %s", indepvars[i]))
//indepvars[i] = st_global("s(ospec)")
// This is now done in reghdfe.ado with -_ms_findomitted-
}
}
S.indepvars = invtokens(indepvars)
st_matrix(Vname, V)
st_numscalar(nname, N)
@@ -178,12 +176,11 @@ mata:
`Integer' df_r,
`Vector' resid)
{
// Hack: the first col of X is actually y
// Hack: the first col of X is actually y!
`Integer' K, KK
`Matrix' xx, inv_xx, W, inv_V
`Vector' xy, w
`Integer' used_df_r
`RowVector' kept
if (S.vcetype == "unadjusted" & S.weight_type=="pweight") S.vcetype = "robust"
if (S.verbose > 0) printf("\n{txt} ## Solving least-squares regression of partialled-out variables\n\n")
@@ -215,8 +212,31 @@ mata:
xy = K ? xx[| 2 , 1 \ K+1 , 1 |] : J(0, 1, .)
xx = K ? xx[| 2 , 2 \ K+1 , K+1 |] : J(0, 0, .)
if (S.timeit) timer_off(91)
b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol?
// This matrix indicates what regressors are not collinear
S.kept = K ? S.kept[2..K+1] : J(1, 0, .)
// Bread of the robust VCV matrix
// Compute this early so we can update the list of collinear regressors
if (S.timeit) timer_on(95)
inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, S.kept)
S.df_m = rank = K - diag0cnt(inv_xx)
KK = rank + S.df_a
S.df_r = N - KK // replaced when clustering
if (S.timeit) timer_off(95)
// Compute betas
if (S.has_weights) {
b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol?
}
else {
// This is more numerically accurate but doesn't handle weights
// See: http://www.stata.com/statalist/archive/2012-02/msg00956.html
b = J(K, 1, 0)
if (cols(S.kept)) {
b[S.kept] = qrsolve(X[., 1:+S.kept], X[., 1])
}
}
if (S.timeit) timer_on(92)
resid = X * (1 \ -b) // y - X * b
@@ -236,13 +256,6 @@ mata:
S.rss = quadcross(resid, w, resid) // do before reghdfe_robust() modifies w
if (S.timeit) timer_off(94)
// Bread of the robust VCV matrix
if (S.timeit) timer_on(95)
inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, kept=.) // invsym(xx, 1..K)
S.df_m = rank = K - diag0cnt(inv_xx)
KK = rank + S.df_a
S.df_r = N - KK // replaced when clustering
if (S.timeit) timer_off(95)
// Compute full VCE
if (S.timeit) timer_on(96)
@@ -263,16 +276,16 @@ mata:
// Wald test: joint significance
if (S.timeit) timer_on(97)
inv_V = invsym(V[kept, kept]) // this might not be of full rank but numerical inaccuracies hide it
inv_V = invsym(V[S.kept, S.kept]) // this might not be of full rank but numerical inaccuracies hide it
if (diag0cnt(inv_V)) {
if (S.verbose > -1) printf("{txt}(Warning: missing F statistic; dropped variables due to collinearity or too few clusters)\n")
W = .
}
else if (length(b[kept])==0) {
else if (length(b[S.kept])==0) {
W = .
}
else {
W = b[kept]' * inv_V * b[kept] / S.df_m
W = b[S.kept]' * inv_V * b[S.kept] / S.df_m
if (missing(W) & S.verbose > -1) printf("{txt}(Warning: missing F statistic)\n")
}
if (S.timeit) timer_off(97)
123 changes: 123 additions & 0 deletions test/omitted.do
@@ -0,0 +1,123 @@
noi cscript "reghdfe: ensure we omit the correct variables" adofile reghdfe

* Setup
local included_e ///
scalar: N rmse tss rss r2 r2_a df_r df_m ll ll_0 /// mss F
matrix: trim_b trim_V ///
macros: wexp wtype

* [TEST] Simplest case
sysuse auto, clear
gen z = 0
reghdfe price z, noab
matrix b = e(b)
assert b[1,1]==0
matrix list b
_ms_omit_info b
assert r(k_omit)==1


* [TEST] Simple case
* https://github.com/sergiocorreia/reghdfe/issues/93

sysuse auto, clear
gen lowrep = rep78<4
loc lhs price
loc rhs lowrep
loc absvars rep78

* 1. Run benchmark
areg `lhs' `rhs', absorb(`absvars')
matrix list e(b)

trim_cons
local bench_df_a = e(df_a)
storedresults save benchmark e()

* 2. Run reghdfe
reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
matrix list e(b)
notrim
storedresults compare benchmark e(), tol(1e-12) include(`included_e')
assert `bench_df_a'==e(df_a)-1

* [TEST] https://github.com/sergiocorreia/reghdfe/issues/95
clear
set obs 100
set seed 123
gen double y = runiform() * 1e8
gen double z = runiform()
gen byte c = 1

loc lhs z
loc rhs y
loc absvars c

* 1. Run benchmark
areg `lhs' `rhs', absorb(`absvars')
matrix list e(b)

trim_cons
local bench_df_a = e(df_a)
storedresults save benchmark e()

* 2. Run reghdfe
reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
matrix list e(b)
notrim
storedresults compare benchmark e(), tol(1e-12) include(`included_e')
assert `bench_df_a'==e(df_a)-1


* [TEST] https://github.com/sergiocorreia/reghdfe/issues/94
clear
input w x y unity
w x y unity
1 -1.10565 -1.223344 1
1 -1.109139 -1.127615 1
0 -.8497003 -1.207231 1
1 -.2734231 -1.294683 1
0 .0063879 -.8343916 1
1 .7669702 -.8245103 1
end

gen z = 0
reghdfe y z, noab

reghdfe y i.w##i.unity x, a(unity) tol(1e-12)
matrix list e(b)
matrix list e(V)

areg y i.w##i.unity x, a(unity)
matrix list e(b)
matrix list e(V)


loc lhs y
loc rhs i.w##i.unity x
loc absvars unity

* 1. Run benchmark
areg `lhs' `rhs', absorb(`absvars')
matrix list e(b)

trim_cons
local bench_df_a = e(df_a)
storedresults save benchmark e()

* 2. Run reghdfe
reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
matrix list e(b)
notrim
matrix b = e(b)
matrix V = e(V)
assert abs(SR__benchmark_trim_b[1,2] - b[1,1]) < 1e-8
assert abs(SR__benchmark_trim_b[1,6] - b[1,4]) < 1e-8
assert abs(SR__benchmark_trim_V[2,2] - V[1,1]) < 1e-8
assert abs(SR__benchmark_trim_V[6,6] - V[4,4]) < 1e-8

* Done!

storedresults drop benchmark
clear matrix
exit