***************************************************************************;
* Non-Linear Decomposition Technique for Logit or Probit Model            *;
* Specified Ordering of Variables - Faster Version                        *;
* Updated on 6/9/15                                                       *;
* Originally developed in:                                                *;
* Fairlie, Robert W. 1999. "The Absence of the African-American Owned     *;
*   Business: An Analysis of the Dynamics of Self-Employment,"            *;
*   Journal of Labor Economics, 17(1): 80-108.                            *;
* Revised to randomly match black/white distributions in:                 *;
* Fairlie, Robert W., and Alicia M. Robb. 2007. "Why are Black-Owned      *;
*   Businesses Less Successful than White-Owned Businesses: The Role      *;
*   of Families, Inheritances, and Business Human Capital,"               *;
*   Journal of Labor Economics, 25(2): 289-323.                           *;
* Revised to incorporate sample weights if needed as discussed in:        *;
* Fairlie, Robert W. 2014. "Addressing Path Dependence and                *;
*   Incorporating Sample Weights in the Nonlinear Blinder-Oaxaca          *;
*   Decomposition Technique for Logit, Probit and Other Nonlinear         *;
*   Models," University of California, Santa Cruz Working Paper.          *;
***************************************************************************;

* NOTES:
* Currently set for use with sample weights, see NOTE:WEIGHTS to remove sample weights;
* Currently set for pooled coefficient estimates, see NOTE:POOLED to change to white or minority coefficient estimates;
* Currently set for 100 iterations, see NOTE:ITERATIONS to change;
* Currently set for Logit model, see NOTE:PROBIT to change;
* Example data set can be downloaded at http://people.ucsc.edu/~rfairlie/decomposition/;

libname sasdata 'c:\temp\decomp';
options obs=max;
options nolabel ls=75 ps=140;

* specify number of iterations;
* NOTE:ITERATIONS change this to 1000 or higher for final run, but test with fewer;
%let numiterations=100;

* NOTE:POOLED define race variables to be included only in pooled logits;
%let r=4;
%let racevars=black latino natamer asian;

* specify total number of independent variables, not including race dummies above;
%let k=31;

* define categories of variables for decomposition;
%let group1= female age;
%let group2= married prevmar children chld617;
%let group3= hsgrad somcol college gradsch;
%let group4= inc1015 inc1520 inc2025 inc2530 inc3035 inc3540 inc4050 inc5060 inc6075 incgt75;
%let group5= midatlan encent wncent satlan escent wscent mountain pacific;
%let group6= notcc notmsa notid;

* define short labels for each group of variables for final table output;
%let labelgroup1="Gender/Age";
%let labelgroup2="Family";
%let labelgroup3="Education";
%let labelgroup4="Income";
%let labelgroup5="Region";
%let labelgroup6="City";

* rename minority variables;
%let mgroup1= mfemale mage;
%let mgroup2= mmarried mprevmar mchildre mchld617;
%let mgroup3= mhsgrad msomcol mcollege mgradsch;
%let mgroup4= minc1015 minc1520 minc2025 minc2530 minc3035 minc3540 minc4050 minc5060 minc6075 mincgt75;
%let mgroup5= mmidatla mencent mwncent msatlan mescent mwscent mmountai mpacific;
%let mgroup6= mnotcc mnotmsa mnotid;

* rename white variables;
%let wgroup1= wfemale wage;
%let wgroup2= wmarried wprevmar wchildre wchld617;
%let wgroup3= whsgrad wsomcol wcollege wgradsch;
%let wgroup4= winc1015 winc1520 winc2025 winc2530 winc3035 winc3540 winc4050 winc5060 winc6075 wincgt75;
%let wgroup5= wmidatla wencent wwncent wsatlan wescent wwscent wmountai wpacific;
%let wgroup6= wnotcc wnotmsa wnotid;

* combine groups;
%let vars= &group1 &group2 &group3 &group4 &group5 &group6;
%let mvars= &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &mgroup6;
%let wvars= &wgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
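* Overview of the sequential decomposition implemented below: with the
  groups ordered 1 through 6, pred(g) is the mean predicted probability
  using minority distributions for groups 1..g and white distributions
  for groups g+1..6, so pred0 uses all-white and pred6 all-minority
  distributions. The contribution of group g is cont(g) = pred(g-1) -
  pred(g), and the contributions telescope so that cont1+...+cont6 =
  pred0 - pred6. In the notation of Fairlie (1999), a sketch of the
  formula for group g is
    contrib_g = (1/Nm) * sum over i of ( F(Xi(g-1)B) - F(Xi(g)B) )
  where Nm is the matched (minority-sized) sample, F is the logistic cdf
  and B is the vector of pooled coefficient estimates;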
* prepare original data for program;
data temp;
  set sasdata.finaldecomp00;
  mergeobs=1;
  * define dependent variable;
  y=hcomp;
  * delete observations with missing values for the dependent variable or key independent variables (extend this check as needed for your data);
  if y=. or hsgrad=. or inc1015=. then delete;
  * delete observations with missing, zero or negative weights;
  if wgt=. or wgt<=0 then delete;
  * NOTE:WEIGHTS set all weights to 1 to run decomposition without sample weights;
  *wgt=1;
  * define sample - e.g. only keep working-age adults for this run;
  if age<25 or age>55 then delete;

* define subset of data for estimating coefficients;
* currently set to use all groups (pooled sample) to estimate coefficients;
* could be changed to use sample for only one group (e.g. white or minority coefficients);
data regdata;
  set temp;
  * if white=1 or black=1;

* create title based on sample chosen for data used in regression;
title 'Decomposition: Home Computer - Pooled (All Races) Coefficients - Specified Ordering';

* create minority sample with minority variable names;
* define which minority group is used;
data minority (keep=ym wgtm &mvars mergeobs);
  set temp;
  array varsa(&k) &vars;
  array mvarsa(&k) &mvars;
  ym=y;
  wgtm=wgt;
  do i=1 to &k;
    mvarsa(i)=varsa(i);
  end;
  if black=1 then output;

* create full white sample with white variable names;
data white (keep=yw wgtw &wvars mergeobs);
  set temp;
  array varsa(&k) &vars;
  array wvarsa(&k) &wvars;
  yw=y;
  wgtw=wgt;
  do i=1 to &k;
    wvarsa(i)=varsa(i);
  end;
  if white=1 then output;

* print out full sample means;
proc means data=minority;
  weight wgtm;
  title2 'Minority Means - Full Sample';
proc means data=white;
  weight wgtw;
  title2 'White Means - Full Sample';

* calculate means of dependent variables for full sample;
* these values are used to calculate the total gap in the decomposition;
proc means data=minority noprint;
  var ym;
  weight wgtm;
  output out=ymdata mean=ymfull n=mn;
proc means data=white noprint;
  var yw;
  weight wgtw;
  output out=ywdata mean=ywfull n=wn;

* define global variables for full white and full minority sample sizes;
data ymdata2;
  set ymdata;
  call symput('mobs', mn);
data ywdata2;
  set ywdata;
  call symput('wobs', wn);
run;

* estimate logit model to obtain coefficients;
* NOTE:PROBIT to change to probit use link=normit;
* NOTE:POOLED set for pooled or specific group sample above;
proc logistic data=regdata outest=orgcoefs covout descending;
  model y=&racevars &vars / link=logit;
  weight wgt / normalize;
  title2 'Logit for Coefficients';

* remove race dummies from coefficient dataset;
* NOTE:POOLED only need this for pooled estimates;
data coefs (drop=&racevars _link_ _type_ _status_ _name_ _lnlike_);
  set orgcoefs;
  mergeobs=1;
  if _n_=1; /* coefs are in first row */

* calculate predicted probabilities for both samples;
data white;
  merge white coefs;
  by mergeobs;
  array coefsa(&k) &vars;
  array wvarsa(&k) &wvars;
  xbeta=intercept;
  do i=1 to &k;
    xbeta=xbeta+wvarsa(i)*coefsa(i);
  end;
  wordprob=exp(xbeta)/(1+exp(xbeta)); *NOTE:PROBIT use normal distribution for probit;
data minority;
  merge minority coefs;
  by mergeobs;
  array coefsa(&k) &vars;
  array mvarsa(&k) &mvars;
  xbeta=intercept;
  do i=1 to &k;
    xbeta=xbeta+mvarsa(i)*coefsa(i);
  end;
  mordprob=exp(xbeta)/(1+exp(xbeta)); *NOTE:PROBIT use normal distribution for probit;

* create empty starting dataset for iterations;
data means2;
  set _null_;
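* NOTE:PROBIT sketch - to estimate a probit rather than a logit, one
  minimal set of changes (a sketch, not a tested drop-in) is to use
  link=normit in proc logistic above and, wherever a NOTE:PROBIT marker
  appears, swap the logistic cdf and pdf for their normal counterparts
  using the built-in probnorm and pdf functions, for example;
* wordprob=probnorm(xbeta);
* fhat0=pdf('normal',xb0);
* the first line replaces exp(xbeta)/(1+exp(xbeta)) with the normal cdf
  and the second replaces pred0*(1-pred0) with the normal pdf;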
* create macro for iterations;
%macro simulate;
%do i=1 %to &numiterations;

* create random subsample of whites with same sample size as minorities;
proc surveyselect data=white method=pps_wr rep=1 sampsize=&mobs seed=&i
  out=white1 outhits noprint;
  size wgtw;

* randomly order white subsample prior to matching;
data white2;
  set white1;
  random1=ranuni(&i);
proc sort data=white2;
  by random1;

* create weighted random resample of minorities with the same sample size;
proc surveyselect data=minority method=pps_wr rep=1 sampsize=&mobs seed=&i
  out=minority1 outhits noprint;
  size wgtm;

* merge datasets together for matching, random matching;
data combined;
  merge white2 minority1;

* calculate decomposition components;
data combined;
  set combined;
  one=1;
  array coefsa(&k) &vars;
  * define distribution switches as arrays;
  array x0a(&k) &wgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
  array x1a(&k) &mgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
  array x2a(&k) &mgroup1 &mgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
  array x3a(&k) &mgroup1 &mgroup2 &mgroup3 &wgroup4 &wgroup5 &wgroup6;
  array x4a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &wgroup5 &wgroup6;
  array x5a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &wgroup6;
  array x6a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &mgroup6;
  xb0=intercept;
  xb1=intercept;
  xb2=intercept;
  xb3=intercept;
  xb4=intercept;
  xb5=intercept;
  xb6=intercept;
  * perform white to black variable distribution switches;
  do i=1 to &k;
    xb0=xb0+x0a(i)*coefsa(i);
    xb1=xb1+x1a(i)*coefsa(i);
    xb2=xb2+x2a(i)*coefsa(i);
    xb3=xb3+x3a(i)*coefsa(i);
    xb4=xb4+x4a(i)*coefsa(i);
    xb5=xb5+x5a(i)*coefsa(i);
    xb6=xb6+x6a(i)*coefsa(i);
  end;
  * calculate various predicted probabilities;
  * NOTE:PROBIT update for normal distribution;
  pred0=exp(xb0)/(1+exp(xb0));
  pred1=exp(xb1)/(1+exp(xb1));
  pred2=exp(xb2)/(1+exp(xb2));
  pred3=exp(xb3)/(1+exp(xb3));
  pred4=exp(xb4)/(1+exp(xb4));
  pred5=exp(xb5)/(1+exp(xb5));
  pred6=exp(xb6)/(1+exp(xb6));
  * calculate various pdfs for standard error calculations;
  * NOTE:PROBIT update for normal distribution;
  fhat0=pred0*(1-pred0);
  fhat1=pred1*(1-pred1);
  fhat2=pred2*(1-pred2);
  fhat3=pred3*(1-pred3);
  fhat4=pred4*(1-pred4);
  fhat5=pred5*(1-pred5);
  fhat6=pred6*(1-pred6);
  * create intercept component of derivatives;
  dc1db0=fhat0*one-fhat1*one;
  dc2db0=fhat1*one-fhat2*one;
  dc3db0=fhat2*one-fhat3*one;
  dc4db0=fhat3*one-fhat4*one;
  dc5db0=fhat4*one-fhat5*one;
  dc6db0=fhat5*one-fhat6*one;
  * calculate contribution derivatives (delta method);
  array dc1dba(&k) dc1db1-dc1db&k;
  array dc2dba(&k) dc2db1-dc2db&k;
  array dc3dba(&k) dc3db1-dc3db&k;
  array dc4dba(&k) dc4db1-dc4db&k;
  array dc5dba(&k) dc5db1-dc5db&k;
  array dc6dba(&k) dc6db1-dc6db&k;
  * create other variable components of derivatives;
  do i=1 to &k;
    dc1dba(i)=fhat0*x0a(i)-fhat1*x1a(i);
    dc2dba(i)=fhat1*x1a(i)-fhat2*x2a(i);
    dc3dba(i)=fhat2*x2a(i)-fhat3*x3a(i);
    dc4dba(i)=fhat3*x3a(i)-fhat4*x4a(i);
    dc5dba(i)=fhat4*x4a(i)-fhat5*x5a(i);
    dc6dba(i)=fhat5*x5a(i)-fhat6*x6a(i);
  end;
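* The standard errors computed below use the delta method: for each group
  g, the variance of the contribution estimate is approximated by
  t(dCg/dB) * V * (dCg/dB), where dCg/dB is the mean derivative vector
  (dcGdb0, dcGdb1, ..., dcGdbk) computed above and V is the coefficient
  covariance matrix saved by proc logistic (covout). The proc iml step
  below implements this as VARg = DCgDB * V * t(DCgDB);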
*****************************;
* calculate standard errors *;
*****************************;

* clean up coefficient/covariance dataset;
data covar (drop=&racevars _link_ _type_ _status_ _name_ _lnlike_);
  set orgcoefs;
  * delete coefs and rows associated with racevars;
  if _n_=1 or (3<=_n_<=(&r+2)) then delete;

* calculate decomposition estimates to save and use for variance calculations;
proc means data=combined noprint;
  var yw ym pred0-pred6
      dc1db0 dc1db1-dc1db&k
      dc2db0 dc2db1-dc2db&k
      dc3db0 dc3db1-dc3db&k
      dc4db0 dc4db1-dc4db&k
      dc5db0 dc5db1-dc5db&k
      dc6db0 dc6db1-dc6db&k;
  output out=means1 mean=;

* create separate datasets to read into proc iml;
* NOTE: check to make sure variables are in the proper order to match to the covariance matrix;
data cont1 (keep=dc1db0 dc1db1-dc1db&k)
     cont2 (keep=dc2db0 dc2db1-dc2db&k)
     cont3 (keep=dc3db0 dc3db1-dc3db&k)
     cont4 (keep=dc4db0 dc4db1-dc4db&k)
     cont5 (keep=dc5db0 dc5db1-dc5db&k)
     cont6 (keep=dc6db0 dc6db1-dc6db&k);
  set means1;

proc iml;
  use covar;
  read all var _num_ into V;
  use cont1;
  read all var _num_ into DC1DB;
  use cont2;
  read all var _num_ into DC2DB;
  use cont3;
  read all var _num_ into DC3DB;
  use cont4;
  read all var _num_ into DC4DB;
  use cont5;
  read all var _num_ into DC5DB;
  use cont6;
  read all var _num_ into DC6DB;
  * calculate standard errors;
  VAR1=DC1DB*V*t(DC1DB);
  VAR2=DC2DB*V*t(DC2DB);
  VAR3=DC3DB*V*t(DC3DB);
  VAR4=DC4DB*V*t(DC4DB);
  VAR5=DC5DB*V*t(DC5DB);
  VAR6=DC6DB*V*t(DC6DB);
  create vardata var {var1 var2 var3 var4 var5 var6};
  append;

* merge variance calculations from proc iml to decomp dataset;
data means1 (keep=yw ym pred0-pred6 var1-var6);
  merge means1 vardata;

* append latest iteration results to all previous iteration results;
data means2;
  set means2 means1;
run;

%end;
%mend;
run;

* turn off notes because macro generates a lot of information, remove this option for debugging program;
options nonotes;

* run simulation - note that it runs numiterations times because of do loop above;
%simulate;
run;

* calculate contribution estimates from changes in predicted probabilities;
data means2;
  set means2;
  cont1=pred0-pred1;
  cont2=pred1-pred2;
  cont3=pred2-pred3;
  cont4=pred3-pred4;
  cont5=pred4-pred5;
  cont6=pred5-pred6;

* calculate means of decomposition runs;
proc means data=means2;
  title2 'Mean Values of Contribution Estimates from Simulations';
  var yw pred0-pred6 ym cont1-cont6 var1-var6;
  output out=meandecomp mean=;

* append the full sample means for ys and calculate percent contributions;
* dataset now has only one obs for the means;
data meandecomp;
  merge meandecomp ywdata ymdata;
  gap=ywfull-ymfull;
  perc1=cont1/gap;
  perc2=cont2/gap;
  perc3=cont3/gap;
  perc4=cont4/gap;
  perc5=cont5/gap;
  perc6=cont6/gap;
  se1=sqrt(var1);
  se2=sqrt(var2);
  se3=sqrt(var3);
  se4=sqrt(var4);
  se5=sqrt(var5);
  se6=sqrt(var6);
  label cont1=&labelgroup1;
  label cont2=&labelgroup2;
  label cont3=&labelgroup3;
  label cont4=&labelgroup4;
  label cont5=&labelgroup5;
  label cont6=&labelgroup6;
options label;

* format output for final decomposition table;
* outputs contribution estimates, gap percents and standard errors;
proc means data=meandecomp mean;
  title2 'Final Output for Table - Mean Values for Contribution Estimates';
  var ywfull ymfull gap
      cont1 se1 perc1
      cont2 se2 perc2
      cont3 se3 perc3
      cont4 se4 perc4
      cont5 se5 perc5
      cont6 se6 perc6;
run;
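* NOTE:POOLED sketch - to estimate with white (or minority) rather than
  pooled coefficients, one possible set of changes (a sketch only, not
  tested against the example data) is to subset the estimation sample and
  drop the race dummies from the model, for example;
* data regdata;
*   set temp;
*   if white=1;
* proc logistic data=regdata outest=orgcoefs covout descending;
*   model y=&vars / link=logit;
*   weight wgt / normalize;
* note that the race-dummy cleanup steps above (the drop=&racevars dataset
  options and the deletion of racevar rows from the covar dataset) would
  also need to be adjusted, since those variables would no longer appear
  in orgcoefs;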