From 79a8c0134ea089f93811be404a5405c38b5a596a Mon Sep 17 00:00:00 2001
From: Sergio Correia
Date: Mon, 17 Apr 2017 18:25:34 -0400
Subject: [PATCH] Fix #94 and #95

Issues resolved:

1) qrsolve(XX, XY) suffers from numerical inaccuracies in some cases, so we
   fall back to qrsolve(X, Y) when there are no weights. (Note: this might be
   a bit slower, since XX and XY are already precomputed. It could be
   optimized further, but for now we leave it as is.)

2) The methods used to detect omitted variables differed across the code.
   When demeaning, we checked whether the regressors were close to zero, but
   used an absolute threshold (1e-8), which fails when there is a large
   scaling difference between Y and X. Now we assume a variable has been
   absorbed by the absvars if the ratio w'w / z'z falls below 1e-9, where
   z = the original variable and w = the demeaned variable. This 1e-9
   threshold also scales with tolerance(), as it is just tolerance*1e-1.

   We also use invsym() as the main driver for deciding whether to drop a
   variable, as that is what the built-in tools use. The inputs of qrsolve()
   then exclude the omitted variables, to prevent any issue.
---
 src/reghdfe.ado         |   3 +-
 src/reghdfe.pkg         |   2 +-
 src/reghdfe.sthlp       |   2 +-
 src/reghdfe_class.mata  |  78 +++++++++++++------------
 src/reghdfe_common.mata |  55 +++++++++++-------
 test/omitted.do         | 123 ++++++++++++++++++++++++++++++++++++++++
 test/testall.do         |   1 +
 7 files changed, 204 insertions(+), 60 deletions(-)
 create mode 100644 test/omitted.do

diff --git a/src/reghdfe.ado b/src/reghdfe.ado
index 40a7e15..9bf6666 100644
--- a/src/reghdfe.ado
+++ b/src/reghdfe.ado
@@ -1,4 +1,4 @@
-*! version 4.1.4 17apr2017
+*! version 4.2.0 17apr2017
 program reghdfe, eclass
 
 * Intercept old+version
@@ -448,6 +448,7 @@ program RegressOLS, eclass
     matrix colnames `b' = `indepvars'
     matrix colnames `V' = `indepvars'
     matrix rownames `V' = `indepvars'
+    _ms_findomitted `b' `V'
     ereturn post `b' `V', `esample' buildfvinfo depname(`depvar')
 }
 else {
diff --git a/src/reghdfe.pkg b/src/reghdfe.pkg
index b76d17a..1c8d81a 100644
--- a/src/reghdfe.pkg
+++ b/src/reghdfe.pkg
@@ -29,7 +29,7 @@
 d Required packages:
 d  ftools
 d  boottest (for Stata version 12 or earlier)
 d
-d Distribution-Date: 20170406
+d Distribution-Date: 20170417
 d
 f reghdfe.ado
diff --git a/src/reghdfe.sthlp b/src/reghdfe.sthlp
index 6a8266f..8663cfe 100644
--- a/src/reghdfe.sthlp
+++ b/src/reghdfe.sthlp
@@ -1,5 +1,5 @@
 {smcl}
-{* *! version 4.1.4 17apr2017}{...}
+{* *! version 4.2.0 17apr2017}{...}
 {vieweralsosee "[R] areg" "help areg"}{...}
 {vieweralsosee "[R] xtreg" "help xtreg"}{...}
 {vieweralsosee "[R] ivregress" "help ivregress"}{...}
diff --git a/src/reghdfe_class.mata b/src/reghdfe_class.mata
index 3d4682f..889fc94 100644
--- a/src/reghdfe_class.mata
+++ b/src/reghdfe_class.mata
@@ -102,6 +102,7 @@ class FixedEffects
     `StringRowVector'   dofadjustments      // firstpair pairwise cluster continuous
     `Varname'           groupvar
     `String'            residuals
+    `RowVector'         kept                // 1 if the regressors are not deemed as omitted (by partial_out+cholsolve+invsym)
     `String'            diopts
 
     // Output
@@ -187,7 +188,6 @@ class FixedEffects
     // -data- is either a varlist or a matrix
     `Variables'         y
     `Varlist'           vars
-    `RowVector'         idx
     `Integer'           i
 
     if (eltype(data) == "string") {
@@ -214,13 +214,6 @@ class FixedEffects
 
         assert_msg(cols(y)==cols(vars), "st_data() constructed more columns than expected")
 
-        if (timeit) timer_on(52)
-        idx = `selectindex'(colmax(abs(colminmax(y))) :== 0)
-        if (timeit) timer_off(52)
-        if (cols(idx) & verbose>-1) {
-            printf("{err}WARNING: after demeaning, some variables are just zeros: %s\n", invtokens(vars[idx]))
-        }
-
         if (timeit) timer_on(53)
         _partial_out(y, save_tss)
         if (timeit) timer_off(53)
@@ -306,6 +299,7 @@ class FixedEffects
        `Real'                  y_mean
        `Vector'                lhs
        `Vector'                alphas
+       `StringRowVector'       vars
        `Integer'               i
 
        if (args()<2 | save_tss==.) save_tss = 0
@@ -349,6 +343,9 @@ class FixedEffects
 
        if (timeit) timer_off(60)
 
+       // Compute 2-norm of each var, to see if we need to drop as regressors
+       kept = diagonal(cross(y, y))'
+
       // Intercept LSMR case
       if (acceleration=="lsmr") {
           // RRE benchmarking
@@ -366,45 +363,54 @@ class FixedEffects
           }
           return
       }
+      else {
+          // Standardize variables
+          if (verbose > 0) printf("{txt}   - Standardizing variables\n")
+          if (timeit) timer_on(61)
+          stdevs = reghdfe_standardize(y)
+          if (timeit) timer_off(61)
-      // Standardize variables
-      if (verbose > 0) printf("{txt}   - Standardizing variables\n")
-      if (timeit) timer_on(61)
-      stdevs = reghdfe_standardize(y)
-      if (timeit) timer_off(61)
-
-      // RRE benchmarking
-      rre_true_residual = rre_true_residual / stdevs[1]
-      if (compute_rre) rre_depvar_norm = norm(y[., 1])
+
+          // RRE benchmarking
+          if (compute_rre) {
+              rre_true_residual = rre_true_residual / stdevs[1]
+              rre_depvar_norm = norm(y[., 1])
+          }
 
-      // Solve
-      if (verbose>0) printf("{txt}   - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance)
-      if (verbose==1) printf("{txt}   - Iterating:")
-      if (verbose>1) printf("{txt}      ")
-      converged = 0
-      iteration_count = 0
-      if (timeit) timer_on(62)
-      y = (*func_accel)(this, y, funct_transform) :* stdevs // this is like python's self
-      if (timeit) timer_off(62)
-      // converged gets updated by check_convergence()
-
-      if (prune) {
-          assert(G==2)
-          if (timeit) timer_on(63)
-          _expand_1core(y)
-          if (timeit) timer_off(63)
+          // Solve
+          if (verbose>0) printf("{txt}   - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance)
+          if (verbose==1) printf("{txt}   - Iterating:")
+          if (verbose>1) printf("{txt}      ")
+          converged = 0 // converged will get updated by check_convergence()
+          iteration_count = 0
+          if (timeit) timer_on(62)
+          y = (*func_accel)(this, y, funct_transform) :* stdevs // 'this' is like python's self
+          if (timeit) timer_off(62)
+
+          if (prune) {
+              assert(G==2)
+              if (timeit) timer_on(63)
+              _expand_1core(y)
+              if (timeit) timer_off(63)
+          }
      }
      // Standardizing makes it hard to detect values that are perfectly collinear with the absvars
-     // in which case they should be 0.00 but they end up as 1e-16
+     // in which case they should be 0.00 but they end up as e.g. 1e-16
      // EG: reghdfe price ibn.foreign , absorb(foreign)
      // This will edit to zero entire columns where *ALL* values are very close to zero
      if (timeit) timer_on(64)
-     needs_zeroing = colmax(abs(colminmax(y))) :< 1e-8 // chose something relatively close to epsilon() ~ 1e-16
-     needs_zeroing = `selectindex'(needs_zeroing)
+     kept = (diagonal(cross(y, y))' :/ kept) :> (tolerance*1e-1)
+     needs_zeroing = `selectindex'(!kept)
      if (cols(needs_zeroing)) {
          y[., needs_zeroing] = J(rows(y), cols(needs_zeroing), 0)
+         vars = tokens(varlist)
+         assert_msg(cols(vars)==cols(y), "cols(vars)!=cols(y)")
+         for (i=1; i<=cols(vars); i++) {
+             if (!kept[i] & verbose>-1) {
+                 printf("{txt}note: omitted %s; close to zero after partialling-out (tol =%3.1e)\n", vars[i], tolerance*1e-1)
+             }
+         }
      }
      if (timeit) timer_off(64)
  }
diff --git a/src/reghdfe_common.mata b/src/reghdfe_common.mata
index 7d9a28b..e113608 100644
--- a/src/reghdfe_common.mata
+++ b/src/reghdfe_common.mata
@@ -132,7 +132,6 @@ mata:
     `Variable'          resid
     `Real'              eps
     `Integer'           i
-    `StringRowVector'   indepvars
 
     if (S.timeit) timer_on(90)
     reghdfe_solve_ols(S, X, b=., V=., N=., rank=., df_r=., resid=.)
@@ -142,15 +141,14 @@ mata:
 
     // Add "o." prefix to omitted regressors
     eps = sqrt(epsilon(1))
-    indepvars = tokens(S.indepvars)
     for (i=1; i<=rows(b); i++) {
-        if (abs(b[i]) < eps) {
-            stata(sprintf("_ms_put_omit %s", indepvars[i]))
-            indepvars[i] = st_global("s(ospec)")
+        if (!S.kept[i] & S.verbose > -1) {
+            printf("{txt}note: %s omitted because of collinearity\n", tokens(S.indepvars)[i])
+            //stata(sprintf("_ms_put_omit %s", indepvars[i]))
+            //indepvars[i] = st_global("s(ospec)")
+            // This is now done in reghdfe.ado with -_ms_findomitted-
         }
     }
-    S.indepvars = invtokens(indepvars)
 
     st_matrix(Vname, V)
     st_numscalar(nname, N)
@@ -178,12 +176,11 @@ mata:
     `Integer' df_r,
     `Vector' resid)
 {
-    // Hack: the first col of X is actually y
+    // Hack: the first col of X is actually y!
     `Integer'               K, KK
     `Matrix'                xx, inv_xx, W, inv_V
     `Vector'                xy, w
     `Integer'               used_df_r
-    `RowVector'             kept
 
     if (S.vcetype == "unadjusted" & S.weight_type=="pweight") S.vcetype = "robust"
     if (S.verbose > 0) printf("\n{txt} ## Solving least-squares regression of partialled-out variables\n\n")
@@ -215,8 +212,31 @@ mata:
     xy = K ? xx[| 2 , 1 \ K+1 , 1 |] : J(0, 1, .)
     xx = K ? xx[| 2 , 2 \ K+1 , K+1 |] : J(0, 0, .)
     if (S.timeit) timer_off(91)
-
-    b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol?
+
+    // This matrix indicates what regressors are not collinear
+    S.kept = K ? S.kept[2..K+1] : J(1, 0, .)
+
+    // Bread of the robust VCV matrix
+    // Compute this early so we can update the list of collinear regressors
+    if (S.timeit) timer_on(95)
+    inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, S.kept)
+    S.df_m = rank = K - diag0cnt(inv_xx)
+    KK = rank + S.df_a
+    S.df_r = N - KK // replaced when clustering
+    if (S.timeit) timer_off(95)
+
+    // Compute betas
+    if (S.has_weights) {
+        b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol?
+    }
+    else {
+        // This is more numerically accurate but doesn't handle weights
+        // See: http://www.stata.com/statalist/archive/2012-02/msg00956.html
+        b = J(K, 1, 0)
+        if (cols(S.kept)) {
+            b[S.kept] = qrsolve(X[., 1:+S.kept], X[., 1])
+        }
+    }
 
     if (S.timeit) timer_on(92)
     resid = X * (1 \ -b) // y - X * b
@@ -236,13 +256,6 @@ mata:
     S.rss = quadcross(resid, w, resid) // do before reghdfe_robust() modifies w
     if (S.timeit) timer_off(94)
 
-    // Bread of the robust VCV matrix
-    if (S.timeit) timer_on(95)
-    inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, kept=.) // invsym(xx, 1..K)
-    S.df_m = rank = K - diag0cnt(inv_xx)
-    KK = rank + S.df_a
-    S.df_r = N - KK // replaced when clustering
-    if (S.timeit) timer_off(95)
 
     // Compute full VCE
     if (S.timeit) timer_on(96)
@@ -263,16 +276,16 @@ mata:
 
     // Wald test: joint significance
     if (S.timeit) timer_on(97)
-    inv_V = invsym(V[kept, kept]) // this might not be of full rank but numerical inaccuracies hide it
+    inv_V = invsym(V[S.kept, S.kept]) // this might not be of full rank but numerical inaccuracies hide it
     if (diag0cnt(inv_V)) {
         if (S.verbose > -1) printf("{txt}(Warning: missing F statistic; dropped variables due to collinearity or too few clusters)\n")
         W = .
     }
-    else if (length(b[kept])==0) {
+    else if (length(b[S.kept])==0) {
         W = .
     }
     else {
-        W = b[kept]' * inv_V * b[kept] / S.df_m
+        W = b[S.kept]' * inv_V * b[S.kept] / S.df_m
         if (missing(W) & S.verbose > -1) printf("{txt}(Warning: missing F statistic)\n")
     }
     if (S.timeit) timer_off(97)
diff --git a/test/omitted.do b/test/omitted.do
new file mode 100644
index 0000000..2dd1135
--- /dev/null
+++ b/test/omitted.do
@@ -0,0 +1,123 @@
+noi cscript "reghdfe: ensure we omit the correct variables" adofile reghdfe
+
+* Setup
+    local included_e ///
+        scalar: N rmse tss rss r2 r2_a df_r df_m ll ll_0 /// mss F
+        matrix: trim_b trim_V ///
+        macros: wexp wtype
+
+* [TEST] Simplest case
+    sysuse auto, clear
+    gen z = 0
+    reghdfe price z, noab
+    matrix b = e(b)
+    assert b[1,1]==0
+    matrix list b
+    _ms_omit_info b
+    assert r(k_omit)==1
+
+
+* [TEST] Simple case
+* https://github.com/sergiocorreia/reghdfe/issues/93
+
+    sysuse auto, clear
+    gen lowrep = rep78<4
+    loc lhs price
+    loc rhs lowrep
+    loc absvars rep78
+
+    * 1. Run benchmark
+    areg `lhs' `rhs', absorb(`absvars')
+    matrix list e(b)
+
+    trim_cons
+    local bench_df_a = e(df_a)
+    storedresults save benchmark e()
+
+    * 2. Run reghdfe
+    reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
+    matrix list e(b)
+    notrim
+    storedresults compare benchmark e(), tol(1e-12) include(`included_e')
+    assert `bench_df_a'==e(df_a)-1
+
+* [TEST] https://github.com/sergiocorreia/reghdfe/issues/95
+    clear
+    set obs 100
+    set seed 123
+    gen double y = runiform() * 1e8
+    gen double z = runiform()
+    gen byte c = 1
+
+    loc lhs z
+    loc rhs y
+    loc absvars c
+
+    * 1. Run benchmark
+    areg `lhs' `rhs', absorb(`absvars')
+    matrix list e(b)
+
+    trim_cons
+    local bench_df_a = e(df_a)
+    storedresults save benchmark e()
+
+    * 2. Run reghdfe
+    reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
+    matrix list e(b)
+    notrim
+    storedresults compare benchmark e(), tol(1e-12) include(`included_e')
+    assert `bench_df_a'==e(df_a)-1
+
+
+* [TEST] https://github.com/sergiocorreia/reghdfe/issues/94
+    clear
+    input w x y unity
+    1 -1.10565 -1.223344 1
+    1 -1.109139 -1.127615 1
+    0 -.8497003 -1.207231 1
+    1 -.2734231 -1.294683 1
+    0 .0063879 -.8343916 1
+    1 .7669702 -.8245103 1
+    end
+
+    gen z = 0
+    reghdfe y z, noab
+
+    reghdfe y i.w##i.unity x, a(unity) tol(1e-12)
+    matrix list e(b)
+    matrix list e(V)
+
+    areg y i.w##i.unity x, a(unity)
+    matrix list e(b)
+    matrix list e(V)
+
+
+    loc lhs y
+    loc rhs i.w##i.unity x
+    loc absvars unity
+
+    * 1. Run benchmark
+    areg `lhs' `rhs', absorb(`absvars')
+    matrix list e(b)
+
+    trim_cons
+    local bench_df_a = e(df_a)
+    storedresults save benchmark e()
+
+    * 2. Run reghdfe
+    reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
+    matrix list e(b)
+    notrim
+    matrix b = e(b)
+    matrix V = e(V)
+    assert abs(SR__benchmark_trim_b[1,2] - b[1,1]) < 1e-8
+    assert abs(SR__benchmark_trim_b[1,6] - b[1,4]) < 1e-8
+    assert abs(SR__benchmark_trim_V[2,2] - V[1,1]) < 1e-8
+    assert abs(SR__benchmark_trim_V[6,6] - V[4,4]) < 1e-8
+
+* Done!
+
+storedresults drop benchmark
+clear matrix
+exit
diff --git a/test/testall.do b/test/testall.do
index 5ed6aef..1306d28 100644
--- a/test/testall.do
+++ b/test/testall.do
@@ -22,6 +22,7 @@
     run missing-absvar
     run options
+    run omitted
     run alphas
 
     cap run cluster-after-expand
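
As an illustration of the two fixes described in the commit message -- (1) flagging regressors absorbed by the fixed effects via the relative norm w'w/z'z rather than an absolute 1e-8 cutoff, and (2) solving the least-squares problem with qrsolve() directly on the partialled-out data instead of on the precomputed cross-products whenever there are no weights -- here is a small, self-contained Mata sketch. It is not part of the patch; the function and variable names (flag_kept, ols_beta, X_orig, X_demeaned, tol) are hypothetical.

mata:
// Illustrative sketch only (hypothetical names; not part of reghdfe)

// flag_kept(): a column is treated as absorbed by the fixed effects when the
// squared norm that survives demeaning is a negligible share of the original
// squared norm (w'w / z'z <= tol). Assumes no column of X_orig is all zeros.
real rowvector flag_kept(real matrix X_orig, real matrix X_demeaned, real scalar tol)
{
    real rowvector zz, ww
    zz = diagonal(cross(X_orig, X_orig))'          // z'z for each column (before demeaning)
    ww = diagonal(cross(X_demeaned, X_demeaned))'  // w'w for each column (after demeaning)
    return((ww :/ zz) :> tol)                      // 1 = keep, 0 = treat as omitted
}

// ols_beta(): solve OLS over the kept columns only, calling qrsolve() on the
// data itself rather than forming the normal equations; omitted coefficients
// stay exactly zero.
real colvector ols_beta(real matrix X, real colvector y, real rowvector kept)
{
    real colvector b, b_kept
    real scalar    i, j
    b = J(cols(X), 1, 0)
    if (!any(kept)) return(b)
    b_kept = qrsolve(select(X, kept), y)           // drop omitted columns, then solve
    j = 0
    for (i=1; i<=cols(X); i++) {
        if (kept[i]) {
            j = j + 1
            b[i] = b_kept[j]                       // scatter back into the full vector
        }
    }
    return(b)
}
end

Forming X'X roughly squares the condition number of the problem, which is why the weights-free branch of the patch prefers qrsolve() on the raw partialled-out columns; with weights it keeps the precomputed cross-products and reghdfe_cholqrsolve().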