***************************************************************************;
* Non-Linear Decomposition Technique for Logit or Probit Model            *;
* Randomized Ordering of Variables - Addresses Path Dependence            *;
* Updated on 6/9/15                                                       *;
* Originally developed in:                                                *;
*   Fairlie, Robert W. 1999. "The Absence of the African-American Owned   *;
*   Business: An Analysis of the Dynamics of Self-Employment,"            *;
*   Journal of Labor Economics, 17(1): 80-108.                            *;
* Revised to randomly match black/white distributions in:                 *;
*   Fairlie, Robert W., and Alicia M. Robb. 2007. "Why are Black-Owned    *;
*   Businesses Less Successful than White-Owned Businesses: The Role      *;
*   of Families, Inheritances, and Business Human Capital,"               *;
*   Journal of Labor Economics, 25(2): 289-323.                           *;
* Revised to randomize variable ordering and incorporate sample weights   *;
* if needed as discussed in:                                              *;
*   Fairlie, Robert W. 2014. "Addressing Path Dependence and              *;
*   Incorporating Sample Weights in the Nonlinear Blinder-Oaxaca          *;
*   Decomposition Technique for Logit, Probit and Other Nonlinear         *;
*   Models," University of California, Santa Cruz Working Paper.          *;
***************************************************************************;
* NOTES:;
* Currently set for use with sample weights, see NOTE:WEIGHTS to remove sample weights;
* Currently set for pooled coefficient estimates, see NOTE:POOLED to change to white or minority coefficient estimates;
* Currently set for 100 iterations, see NOTE:ITERATIONS to change;
* Currently set for Logit model, see NOTE:PROBIT to change;
* Example data set can be downloaded at http://people.ucsc.edu/~rfairlie/decomposition/;

libname sasdata 'c:\temp\decomp';
options obs=max;
options nolabel ls=75 ps=140;

* specify number of iterations;
* NOTE:ITERATIONS change this to 1000 or higher for final run, but test with fewer;
%let numiterations=100;

* NOTE:POOLED define race variables to be included only in pooled logits;
* &r is the count of race dummies; it is used below to strip their rows from;
* the covariance matrix output by proc logistic;
%let r=4;
%let racevars=black latino natamer asian;

* specify total number of independent variables, not including race dummies above;
%let k=31;

* define categories ("groups") of variables for decomposition;
* the six groups together must contain exactly &k variables;
%let definegroup1 = female age;
%let definegroup2 = married prevmar children chld617;
%let definegroup3 = hsgrad somcol college gradsch;
%let definegroup4 = inc1015 inc1520 inc2025 inc2530 inc3035 inc3540 inc4050 inc5060 inc6075 incgt75;
%let definegroup5 = midatlan encent wncent satlan escent wscent mountain pacific;
%let definegroup6 = notcc notmsa notid;

* define short labels for each group of variables for final table output;
%let labelgroup1="Gender/Age";
%let labelgroup2="Family";
%let labelgroup3="Education";
%let labelgroup4="Income";
%let labelgroup5="Region";
%let labelgroup6="City";

* rename minority variables (m-prefixed copies of each group, in the same order);
%let mdefinegroup1 = mfemale mage;
%let mdefinegroup2 = mmarried mprevmar mchildre mchld617;
%let mdefinegroup3 = mhsgrad msomcol mcollege mgradsch;
%let mdefinegroup4 = minc1015 minc1520 minc2025 minc2530 minc3035 minc3540 minc4050 minc5060 minc6075 mincgt75;
%let mdefinegroup5 = mmidatla mencent mwncent msatlan mescent mwscent mmountai mpacific;
%let mdefinegroup6 = mnotcc mnotmsa mnotid;

* rename white variables (w-prefixed copies of each group, in the same order);
%let wdefinegroup1 = wfemale wage;
%let wdefinegroup2 = wmarried wprevmar wchildre wchld617;
%let wdefinegroup3 = whsgrad wsomcol wcollege wgradsch;
%let wdefinegroup4 = winc1015 winc1520 winc2025 winc2530 winc3035 winc3540 winc4050 winc5060 winc6075 wincgt75;
%let wdefinegroup5 = wmidatla wencent wwncent wsatlan wescent wwscent wmountai wpacific;
%let wdefinegroup6 = wnotcc wnotmsa wnotid;

* combine groups into full &k-element variable lists;
%let vars= &definegroup1 &definegroup2 &definegroup3 &definegroup4 &definegroup5 &definegroup6;
%let mvars= &mdefinegroup1 &mdefinegroup2 &mdefinegroup3 &mdefinegroup4 &mdefinegroup5 &mdefinegroup6;
%let wvars= &wdefinegroup1 &wdefinegroup2 &wdefinegroup3 &wdefinegroup4 &wdefinegroup5 &wdefinegroup6;

* prepare original data for program;
* mergeobs=1 is a constant key used later to broadcast the one-row coefficient;
* dataset onto every observation via a by-merge;
data temp;
set sasdata.finaldecomp00;
mergeobs=1;
* define dependent variable;
y=hcomp;
* delete observations with any missing values for dependent or independent variables;
if y=. or hsgrad=. or inc1015=. then delete;
* delete observations with missing, zero or negative weights;
if wgt=. or wgt<=0 then delete;
* NOTE:WEIGHTS set all weights to 1 to run decomposition without sample weights;
*wgt=1;
* define sample - e.g. only keep working-age adults for this run;
if age<25 or age>55 then delete;

* define subset of data for estimating coefficients;
* currently set to use all groups (pooled sample) to estimate coefficients;
* could be changed to use sample for only one group (e.g. white or minority coefficients);
data regdata;
set temp;
* if white=1 or black=1;

* Create title based on sample chosen for data used in regression;
title 'Decomposition: Home Computer - Pooled (All Races) Coefficients - Random Ordering';

* create minority sample with minority variable names;
* copies each of the &k regressors into its m-prefixed counterpart so the;
* minority data can later coexist with the white data in one dataset;
* define which minority group is used (here: black=1);
data minority (keep=ym wgtm &mvars mergeobs);
set temp;
array varsa(&k) &vars;
array mvarsa(&k) &mvars;
ym=y;
wgtm=wgt;
do i=1 to &k;
mvarsa(i)=varsa(i);
end;
if black=1 then output;

* create full white sample with white variable names (w-prefixed copies);
data white (keep=yw wgtw &wvars mergeobs);
set temp;
array varsa(&k) &vars;
array wvarsa(&k) &wvars;
yw=y;
wgtw=wgt;
do i=1 to &k;
wvarsa(i)=varsa(i);
end;
if white=1 then output;

* print out full sample means;
proc means data=minority;
weight wgtm;
title2 'Minority Means - Full Sample';
proc means data=white;
weight wgtw;
title2 'White Means - Full Sample';

* calculate means of dependent variables for full sample;
* these values are used to calculate the total gap in the decomposition;
proc means data=minority noprint;
var ym;
weight wgtm;
output out=ymdata mean=ymfull n=mn;
proc means data=white noprint;
var yw;
weight wgtw;
output out=ywdata mean=ywfull n=wn;

* define global macro variables for full white and full minority sample sizes;
* &mobs drives the subsample sizes drawn inside the simulation macro;
data ymdata2; set ymdata; call symput('mobs',mn); run;
data ywdata2; set ywdata; call symput('wobs',wn); run;

* estimate logit model to obtain coefficients;
* NOTE:PROBIT to change to probit use link=normit;
* NOTE:POOLED set for pooled or specific group sample above;
* covout also writes the covariance matrix rows into the outest dataset;
proc logistic data=regdata outest=orgcoefs covout descending;
model y=&racevars &vars / link=logit;
weight wgt / normalize;
title2 'Logit for Coefficients';
run;

* remove race dummies from coefficient dataset;
* NOTE:POOLED only need this for pooled estimates;
data coefs (drop=&racevars _link_ _type_ _status_ _name_ _lnlike_);
set orgcoefs;
mergeobs=1;
if _n_=1; /* coefs are in first row */

* calculate predicted probabilities for both samples;
* merging the one-row coefs dataset by mergeobs attaches the coefficient;
* values (stored under the unprefixed names in &vars, plus intercept) to;
* every observation;
data white;
merge white coefs;
by mergeobs;
array coefsa(&k) &vars;
array wvarsa(&k) &wvars;
xbeta=intercept;
do i=1 to &k;
xbeta=xbeta+wvarsa(i)*coefsa(i);
end;
wordprob=exp(xbeta)/(1+exp(xbeta)); *NOTE:PROBIT use normal distribution for probit;
data minority;
merge minority coefs;
by mergeobs;
array coefsa(&k) &vars;
array mvarsa(&k) &mvars;
xbeta=intercept;
do i=1 to &k;
xbeta=xbeta+mvarsa(i)*coefsa(i);
end;
mordprob=exp(xbeta)/(1+exp(xbeta)); *NOTE:PROBIT use normal distribution for probit;

* create empty starting dataset to accumulate iteration results;
data means2;
set _null_;

* create macro for iterations;
* each iteration: (1) draw a weighted with-replacement white subsample of;
* size &mobs and randomly order it, (2) draw a weighted with-replacement;
* minority subsample of the same size, (3) match the two samples row by row,;
* (4) randomly reorder the six variable groups, (5) switch distributions;
* white-to-minority one group at a time and record each group contribution;
* and its delta-method variance, (6) append results to means2;
%macro simulate;
%do i=1 %to &numiterations;

* create random subsample of whites with same sample size as minorities;
* pps_wr = probability proportional to (weight) size, with replacement;
* NOTE(review): seed=&i is reused for both surveyselect calls and for;
* ranuni below in the same iteration - confirm this dependence is intended;
proc surveyselect data=white method=pps_wr rep=1 sampsize=&mobs seed=&i out=white1 outhits noprint;
size wgtw;

* randomly order white subsample prior to matching;
data white2;
set white1;
random1=ranuni(&i);
proc sort data=white2;
by random1;

* create random subsample of minorities, also of size &mobs;
proc surveyselect data=minority method=pps_wr rep=1 sampsize=&mobs seed=&i out=minority1 outhits noprint;
size wgtm;

* merge datasets together for matching - a one-to-one (row order) merge,;
* which is the random matching of white to minority observations;
data combined;
merge white2 minority1;
one=1;

* randomly assign variable groups ("defined groups") to "number groups";
* for random ordering of variable groups;
data rand;
do i=1 to 6;
r = ranuni(&i);
output;
end;
proc sort data=rand;
by r;

* after the sort, row _n_ holds original group i in random position _n_;;
* call symput writes macro variables group1-group6 (and m/w variants) so;
* that &groupJ expands to the variable list placed J-th in this iteration;
data orderiterationtemp;
set rand;
group = catt("group",i);
mgroup = catt("mgroup",i);
wgroup = catt("wgroup",i);
if _n_ = 1 then do;
call symput(group,"&definegroup1");
call symput(mgroup,"&mdefinegroup1");
call symput(wgroup,"&wdefinegroup1");
end;
else if _n_ = 2 then do;
call symput(group,"&definegroup2");
call symput(mgroup,"&mdefinegroup2");
call symput(wgroup,"&wdefinegroup2");
end;
else if _n_ = 3 then do;
call symput(group,"&definegroup3");
call symput(mgroup,"&mdefinegroup3");
call symput(wgroup,"&wdefinegroup3");
end;
else if _n_ = 4 then do;
call symput(group,"&definegroup4");
call symput(mgroup,"&mdefinegroup4");
call symput(wgroup,"&wdefinegroup4");
end;
else if _n_ = 5 then do;
call symput(group,"&definegroup5");
call symput(mgroup,"&mdefinegroup5");
call symput(wgroup,"&wdefinegroup5");
end;
else do;
call symput(group,"&definegroup6");
call symput(mgroup,"&mdefinegroup6");
call symput(wgroup,"&wdefinegroup6");
end;
* ordgroupJ records which original group sits in random position J;
array ordgroup {6} ordgroup1 ordgroup2 ordgroup3 ordgroup4 ordgroup5 ordgroup6;
do j = 1 to 6;
if _n_ = j then ordgroup[j] = i;
end;
one = 1;

* create dataset recording which original variable group is assigned to;
* each randomly ordered group (collapses the 6 rows into one);
proc means data = orderiterationtemp noprint;
var ordgroup1 ordgroup2 ordgroup3 ordgroup4 ordgroup5 ordgroup6 one;
output out = orderiteration mean = ;

* clean up dataset;
data orderiteration (drop = _type_ _freq_);
set orderiteration;

* append information about random order of original variable groups;
data combined;
merge combined orderiteration;
by one;

* calculate decomposition components;
data combined;
set combined;
* variables and coefficients need to be in the new random order for this;
* iteration; the unprefixed names in &groupJ hold coefficient values;
* (merged from coefs above), while the m/w-prefixed names hold data;
%let newvars= &group1 &group2 &group3 &group4 &group5 &group6;
array newcoefsa (&k) &group1 &group2 &group3 &group4 &group5 &group6;
* define distribution switches as arrays: xJa replaces the first J groups;
* of the white distribution with the minority distribution;
array x0a(&k) &wgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
array x1a(&k) &mgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
array x2a(&k) &mgroup1 &mgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
array x3a(&k) &mgroup1 &mgroup2 &mgroup3 &wgroup4 &wgroup5 &wgroup6;
array x4a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &wgroup5 &wgroup6;
array x5a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &wgroup6;
array x6a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &mgroup6;
xb0=intercept;
xb1=intercept;
xb2=intercept;
xb3=intercept;
xb4=intercept;
xb5=intercept;
xb6=intercept;
* perform white to minority variable distribution switches;
do i=1 to &k;
xb0=xb0+x0a(i)*newcoefsa(i);
xb1=xb1+x1a(i)*newcoefsa(i);
xb2=xb2+x2a(i)*newcoefsa(i);
xb3=xb3+x3a(i)*newcoefsa(i);
xb4=xb4+x4a(i)*newcoefsa(i);
xb5=xb5+x5a(i)*newcoefsa(i);
xb6=xb6+x6a(i)*newcoefsa(i);
end;
* calculate various predicted probabilities (logistic cdf of each index);
* NOTE:PROBIT update for normal distribution;
pred0=exp(xb0)/(1+exp(xb0));
pred1=exp(xb1)/(1+exp(xb1));
pred2=exp(xb2)/(1+exp(xb2));
pred3=exp(xb3)/(1+exp(xb3));
pred4=exp(xb4)/(1+exp(xb4));
pred5=exp(xb5)/(1+exp(xb5));
pred6=exp(xb6)/(1+exp(xb6));
* calculate various pdfs for standard error calculations;
* (logistic pdf = F*(1-F) evaluated at each index);
* NOTE:PROBIT update for normal distribution;
fhat0=pred0*(1-pred0);
fhat1=pred1*(1-pred1);
fhat2=pred2*(1-pred2);
fhat3=pred3*(1-pred3);
fhat4=pred4*(1-pred4);
fhat5=pred5*(1-pred5);
fhat6=pred6*(1-pred6);
* create intercept component to derivatives (one=1 is the constant term);
dc1db0=fhat0*one-fhat1*one;
dc2db0=fhat1*one-fhat2*one;
dc3db0=fhat2*one-fhat3*one;
dc4db0=fhat3*one-fhat4*one;
dc5db0=fhat4*one-fhat5*one;
dc6db0=fhat5*one-fhat6*one;
* calculate contribution derivatives (delta method);
array dc1dba(&k) dc1db1-dc1db&k;
array dc2dba(&k) dc2db1-dc2db&k;
array dc3dba(&k) dc3db1-dc3db&k;
array dc4dba(&k) dc4db1-dc4db&k;
array dc5dba(&k) dc5db1-dc5db&k;
array dc6dba(&k) dc6db1-dc6db&k;
* create other variable components to derivatives;
do i=1 to &k;
dc1dba(i)=fhat0*x0a(i)-fhat1*x1a(i);
dc2dba(i)=fhat1*x1a(i)-fhat2*x2a(i);
dc3dba(i)=fhat2*x2a(i)-fhat3*x3a(i);
dc4dba(i)=fhat3*x3a(i)-fhat4*x4a(i);
dc5dba(i)=fhat4*x4a(i)-fhat5*x5a(i);
dc6dba(i)=fhat5*x5a(i)-fhat6*x6a(i);
end;

*****************************;
* calculate standard errors *;
*****************************;
* re-estimate logit model with new random variable ordering so the;
* covariance matrix columns line up with the reordered derivative vectors;
* NOTE:PROBIT to change to probit use link=normit;
* NOTE:POOLED set for pooled or specific group sample above;
proc logistic data=regdata outest=orgcoefs2 covout descending noprint;
model y=&racevars &newvars / link=logit;
weight wgt / normalize;

* clean up coefficient/covariance dataset;
data covar (drop=&racevars _link_ _type_ _status_ _name_ _lnlike_);
set orgcoefs2;
* delete coefs (row 1) and covariance rows associated with racevars;
* (rows 3 through &r+2); row 2 is kept;
if _n_=1 or (3<=_n_<=(&r+2)) then delete;

* calculate decomposition estimates (means over matched pairs) to save;
* and use for variance calculations;
proc means data=combined noprint;
var yw ym pred0-pred6 dc1db0 dc1db1-dc1db&k dc2db0 dc2db1-dc2db&k dc3db0 dc3db1-dc3db&k dc4db0 dc4db1-dc4db&k dc5db0 dc5db1-dc5db&k dc6db0 dc6db1-dc6db&k ordgroup1-ordgroup6;
output out=means1 mean=;

* create separate datasets to read into proc iml;
* NOTE: check to make sure variables are in the proper order to match to the covariance matrix;
data cont1 (keep=dc1db0 dc1db1-dc1db&k) cont2 (keep=dc2db0 dc2db1-dc2db&k) cont3 (keep=dc3db0 dc3db1-dc3db&k) cont4 (keep=dc4db0 dc4db1-dc4db&k) cont5 (keep=dc5db0 dc5db1-dc5db&k) cont6 (keep=dc6db0 dc6db1-dc6db&k);
set means1;

* delta-method variances: VARj = DCjDB * V * DCjDB' for each group j;
proc iml;
use covar;
read all var _num_ into V;
use cont1;
read all var _num_ into DC1DB;
use cont2;
read all var _num_ into DC2DB;
use cont3;
read all var _num_ into DC3DB;
use cont4;
read all var _num_ into DC4DB;
use cont5;
read all var _num_ into DC5DB;
use cont6;
read all var _num_ into DC6DB;
* calculate standard error;
VAR1=DC1DB*V*t(DC1DB);
VAR2=DC2DB*V*t(DC2DB);
VAR3=DC3DB*V*t(DC3DB);
VAR4=DC4DB*V*t(DC4DB);
VAR5=DC5DB*V*t(DC5DB);
VAR6=DC6DB*V*t(DC6DB);
create vardata var {var1 var2 var3 var4 var5 var6};
append;

* merge variance calculations from proc iml to decomp dataset;
* contJ = drop in mean predicted probability from switching group J;
data means1 (keep=yw ym pred0-pred6 var1-var6 cont1-cont6 ordgroup1-ordgroup6);
merge means1 vardata;
cont1=pred0-pred1;
cont2=pred1-pred2;
cont3=pred2-pred3;
cont4=pred3-pred4;
cont5=pred4-pred5;
cont6=pred5-pred6;

* record original variable group contributions from random order variable;
* group contributions: position-J results are mapped back to the original;
* group number stored in ordgroupJ;
data means1 (keep = yw ym contg1-contg6 vargroup1-vargroup6);
set means1;
array ordgroupa{6} ordgroup1-ordgroup6;
array contgroupa{6} contg1-contg6;
array vargroupa{6} vargroup1-vargroup6;
array conta{6} cont1-cont6;
array vara{6} var1-var6;
do j = 1 to 6;
do i = 1 to 6;
if ordgroupa[j] = i then do;
contgroupa[j] = conta[i];
vargroupa[j] = vara[i];
end;
end;
end;

* append latest iteration results to all previous iterations results;
data means2;
set means2 means1;
run;

%end;
%mend;
run;

* turn off notes because macro generates a lot of information;
* remove this option for debugging program;
options nonotes;

* run simulation - note that it runs numiterations times because of do loop above;
%simulate;
run;

* calculate means of decomposition runs (averages over all iterations);
proc means data=means2;
title2 'Mean Values of Contribution Estimates from Simulations';
var yw ym contg1-contg6 vargroup1-vargroup6;
output out=meandecomp mean=;

* append the full sample means for ys and calculate percent contributions;
* dataset now has only one obs for the means;
data meandecomp;
merge meandecomp ywdata ymdata;
gap=ywfull-ymfull;
perc1=contg1/gap;
perc2=contg2/gap;
perc3=contg3/gap;
perc4=contg4/gap;
perc5=contg5/gap;
perc6=contg6/gap;
se1=sqrt(vargroup1);
se2=sqrt(vargroup2);
se3=sqrt(vargroup3);
se4=sqrt(vargroup4);
se5=sqrt(vargroup5);
se6=sqrt(vargroup6);
label contg1=&labelgroup1;
label contg2=&labelgroup2;
label contg3=&labelgroup3;
label contg4=&labelgroup4;
label contg5=&labelgroup5;
label contg6=&labelgroup6;
options label;

* format output for final decomposition table;
* outputs contribution estimates, gap percents and standard errors;
proc means data=meandecomp mean;
title2 'Final Output for Table - Means Values for Contribution Estimates';
var ywfull ymfull gap contg1 se1 perc1 contg2 se2 perc2 contg3 se3 perc3 contg4 se4 perc4 contg5 se5 perc5 contg6 se6 perc6;
run;