**********************************************************************;
* Example of Non-Linear Decomposition Technique for Logit Model      *;
* Simplified Version without Standard Errors                         *;
* Updated on 7/3/13                                                  *;
* Used in:                                                           *;   
* Fairlie, Robert W. 1999 "The Absence of the African-American Owned *;
*   Business: An Analysis of the Dynamics of Self-Employment,"       *;
*   Journal of Labor Economics, 17(1): 80-108.                       *;
* Fairlie, Robert W. 2005. "An Extension of the Blinder-Oaxaca       *;
*   Decomposition Technique to Logit and Probit Models," Journal of  *;
*   Economic and Social Measurement, 30(4): 305-316.                 *;
**********************************************************************;

* point the sasdata libref at the folder that holds the example data;
libname sasdata 'c:\temp\comp\decomp\examples';

* read every observation, suppress variable labels in output, and
  set the listing to 75 columns by 140 lines per page;
options obs=max nolabel ls=75 ps=140;

title 'Home Computer - Pooled 2 (All Races) Coefficients';

* specify number of simulations;
* NOTE: change this to 1000 for final run;
%let numsims=10;

* define race variables to be included only in pooled logits;
* r is set to the number of names in racevars - it is not referenced
  by the steps visible in this program;
%let r=4;
%let racevars=black latino natamer asian;

* define categories of variables for decomposition;
* the decomposition switches these groups from white to minority
  values one group at a time in the order group1 to group6;
%let group1= female age;
%let group2= married prevmar children chld617;
%let group3= hsgrad somcol college gradsch;
%let group4= inc1015 inc1520 inc2025 inc2530 inc3035 inc3540
             inc4050 inc5060 inc6075 incgt75;
%let group5= midatlan encent wncent satlan escent wscent mountain pacific;
%let group6= notcc notmsa notid;

* minority copies of the variables - same order as the groups above
  because later steps pair the lists by position in arrays;
* NOTE(review): the longer names look truncated to 8 characters
  (mchildre mmidatla mmountai) - keep them consistent if lists change;
%let mgroup1= mfemale mage;
%let mgroup2= mmarried mprevmar mchildre mchld617;
%let mgroup3= mhsgrad msomcol mcollege mgradsch;
%let mgroup4= minc1015 minc1520 minc2025 minc2530 minc3035 minc3540
              minc4050 minc5060 minc6075 mincgt75;
%let mgroup5= mmidatla mencent mwncent msatlan mescent mwscent mmountai mpacific;
%let mgroup6= mnotcc mnotmsa mnotid;

* white copies of the variables - same order as the groups above;
%let wgroup1= wfemale wage;
%let wgroup2= wmarried wprevmar wchildre wchld617;
%let wgroup3= whsgrad wsomcol wcollege wgradsch;
%let wgroup4= winc1015 winc1520 winc2025 winc2530 winc3035 winc3540
              winc4050 winc5060 winc6075 wincgt75;
%let wgroup5= wmidatla wencent wwncent wsatlan wescent wwscent wmountai wpacific;
%let wgroup6= wnotcc wnotmsa wnotid;

* combine groups;
%let vars= &group1 &group2 &group3 &group4 &group5 &group6;
%let mvars= &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &mgroup6;
%let wvars= &wgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;

/* Number of independent variables.  Derived from &vars (31 names with   */
/* the lists above) instead of being typed in by hand, so that adding or */
/* removing a variable cannot leave k out of step with the lists.  The   */
/* arrays declared later as (&k) over &mvars and &wvars will raise an    */
/* error if those lists do not contain the same number of names.         */
%let k=%sysfunc(countw(&vars, %str( )));


* prepare original data for program;
* output dataset temp is the analysis sample used by all later steps;
data temp;
  set sasdata.finaldecomp00;
* constant key used later to attach the one-row coefficient dataset;
  mergeobs=1;
* define dependent variable;
  y=hcomp;
/* Delete observations with any missing value for the dependent or an    */
/* independent variable.  The previous test looked only at y, hsgrad and */
/* inc1015, so a row with some other covariate missing stayed in the     */
/* minority/white samples (giving a missing predicted probability) even  */
/* though PROC LOGISTIC drops such rows when estimating.  NMISS over the */
/* full model list keeps the decomposition sample and the estimation     */
/* sample consistent.                                                    */
  if nmiss(of y &racevars &vars) > 0 then delete;
* define sample - e.g. only keep working-age adults for this run;
* a missing age compares as less than 25 and is therefore also deleted;
  if age<25 or age>55 then delete;
run;


* choose the observations used to estimate the logit coefficients;
* add a subsetting condition here to estimate from one group only
  e.g. white coefficients;
* as written the step copies temp unchanged so the pooled sample
  (all groups) is used to estimate the coefficients;
data temp2;
  set temp;
run;

* build the minority sample and store its characteristics under the
  minority variable names;
* the condition on black selects which minority group is decomposed;
data minority (keep=ym &mvars mergeobs);
  set temp;
  if black=1;
  array src(&k) &vars;
  array dst(&k) &mvars;
* minority copy of the dependent variable;
  ym=y;
* copy each independent variable into its minority-named twin - the
  two lists are paired by position;
  do j=1 to dim(src);
     dst(j)=src(j);
  end;
run;

* build the full white sample and store its characteristics under the
  white variable names;
data white (keep=yw &wvars mergeobs);
  set temp;
  if white=1;
  array src(&k) &vars;
  array dst(&k) &wvars;
* white copy of the dependent variable;
  yw=y;
* copy each independent variable into its white-named twin - the two
  lists are paired by position;
  do j=1 to dim(src);
     dst(j)=src(j);
  end;
run;

* list descriptive statistics for every variable in each full sample;
proc means data=minority;
  title2 'Minority Means';
run;
proc means data=white;
  title2 'White Means - Full Sample';
run;

* store the full-sample mean of the dependent variable for each group;
* ymfull and ywfull are used later to compute the total gap in the
  decomposition;
proc means data=minority noprint;
  output out=ymdata mean=ymfull;
  var ym;
run;
proc means data=white noprint;
  output out=ywdata mean=ywfull;
  var yw;
run;

* fit the logit on the estimation sample chosen above (pooled or a
  specific group) and save the estimates to orgcoefs;
* descending makes the model describe the higher value of y;
proc logistic data=temp2 descending outest=orgcoefs covout;
  title2 'Logit for Coefficients';
  model y=&racevars &vars / link=logit;
run;

* keep only the coefficient row and discard the race dummies and the
  bookkeeping columns;
* dropping the race dummies is only needed for pooled estimates;
* mergeobs=1 is the key used to attach this single row to every
  observation of the white and minority samples;
data coefs (drop=&racevars  _link_ _type_ _status_ _name_ _lnlike_);
  set orgcoefs (obs=1);  /* coefs are in first row */
  mergeobs=1;
run;


* attach the coefficients to each sample and compute the predicted
  probability for every observation;
* after the merge the unprefixed variable names hold the coefficients
  while the w- and m-prefixed names hold the characteristics;

* white sample - result is stored in wordprob;
data white;
  merge white coefs;
  by mergeobs;
  array beta(&k) &vars;
  array xw(&k) &wvars;
* linear index = intercept plus the sum of coefficient times value;
  xbeta=intercept;
  do i=1 to dim(beta);
     xbeta=xbeta+xw(i)*beta(i);
  end;
* logistic transformation of the index;
  wordprob=exp(xbeta)/(1+exp(xbeta));
run;

* minority sample - result is stored in mordprob;
data minority;
  merge minority coefs;
  by mergeobs;
  array beta(&k) &vars;
  array xm(&k) &mvars;
  xbeta=intercept;
  do i=1 to dim(beta);
     xbeta=xbeta+xm(i)*beta(i);
  end;
  mordprob=exp(xbeta)/(1+exp(xbeta));
run;


* order the minority sample by predicted probability - the simulation
  macro pairs it row by row with a white subsample sorted the same way;
proc sort data=minority;
  by mordprob;
run;

* start the simulation results dataset with no observations so each
  simulation can append its row of means;
data means2;
  stop;
run;

* create macro for simulations;
%macro simulate;
  /* Runs &numsims replications of the decomposition.  Each replication  */
  /* draws a random white subsample the size of the minority sample,     */
  /* pairs it with the minority sample by rank of predicted probability, */
  /* computes mean predicted probabilities as the variable groups are    */
  /* switched from white to minority values one group at a time, and     */
  /* appends the resulting row of means to dataset means2.               */
  /* Inputs : datasets white and minority (minority already sorted by    */
  /*          mordprob), macro variables numsims, k, vars and the        */
  /*          wgroup1-wgroup6 / mgroup1-mgroup6 lists.                   */
  /* Outputs: one observation per replication appended to means2.        */
  /* Comments inside the macro use block-comment form so that they are   */
  /* not tokenized as macro text.                                        */

  /* keep the loop counter local so an existing &i is not overwritten */
  %local i;

  %do i=1 %to &numsims;

    /* first, delete white observations to match black sample size:     */
    /* give every white observation a random number, using the          */
    /* replication number as the seed                                   */
    data white1;
      set white;
      random1=ranuni(&i);
    run;

    proc sort data=white1;
      by random1;
    run;

    /* one-to-one merge with the minority ym column: white rows beyond  */
    /* the minority sample size have ym missing and are removed         */
    data white2 (drop=ym);
      merge minority (keep=ym) white1;
      if ym=. then delete;  /* deletes extra white observations */
    run;

    /* second, reorder random white subsample by predicted probabilities */
    proc sort data=white2;
      by wordprob;
    run;

    /* third, merge datasets together for matching (row by row, no BY)  */
    /* and calculate the decomposition components in the same step      */
    data combined;
      merge white2 minority;
      one=1;
      /* coefficients carried on every row under the original names */
      array coefsa(&k) &vars;
      /* distribution switches: x0a is all white characteristics, each  */
      /* following array replaces one more group with minority values,  */
      /* and x6a is all minority characteristics                        */
      array x0a(&k) &wgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
      array x1a(&k) &mgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
      array x2a(&k) &mgroup1 &mgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
      array x3a(&k) &mgroup1 &mgroup2 &mgroup3 &wgroup4 &wgroup5 &wgroup6;
      array x4a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &wgroup5 &wgroup6;
      array x5a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &wgroup6;
      array x6a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &mgroup6;

      /* every linear index starts from the intercept */
      xb0=intercept;
      xb1=intercept;
      xb2=intercept;
      xb3=intercept;
      xb4=intercept;
      xb5=intercept;
      xb6=intercept;

      /* perform white to black variable distribution switches */
      do i=1 to &k;
         xb0=xb0+x0a(i)*coefsa(i);
         xb1=xb1+x1a(i)*coefsa(i);
         xb2=xb2+x2a(i)*coefsa(i);
         xb3=xb3+x3a(i)*coefsa(i);
         xb4=xb4+x4a(i)*coefsa(i);
         xb5=xb5+x5a(i)*coefsa(i);
         xb6=xb6+x6a(i)*coefsa(i);
      end;

      /* calculate various predicted probabilities */
      pred0=exp(xb0)/(1+exp(xb0));
      pred1=exp(xb1)/(1+exp(xb1));
      pred2=exp(xb2)/(1+exp(xb2));
      pred3=exp(xb3)/(1+exp(xb3));
      pred4=exp(xb4)/(1+exp(xb4));
      pred5=exp(xb5)/(1+exp(xb5));
      pred6=exp(xb6)/(1+exp(xb6));
    run;

    /* calculate decomposition estimates for this simulation */
    proc means data=combined noprint;
      var yw ym pred0-pred6;
      output out=means1 mean=;
    run;

    /* append latest simulation results to all previous results */
    data means2;
      set means2 means1;
    run;

  %end;

%mend simulate;

run;

* turn off notes because macro generates a lot of information;
* remove the options nonotes statement for debugging;
/* remember the current NOTES/NONOTES setting so it can be put back     */
/* once the simulations are done - otherwise the log stays silenced for */
/* every later step in the session                                      */
%let _notes_setting=%sysfunc(getoption(notes));
options nonotes;

* run simulation - note that it runs numsims times because of the
  do loop inside the macro;
%simulate;
run;

/* restore the log setting that was in effect before the simulations */
options &_notes_setting;



* calculate contribution estimates from changes in predicted probabilities;
* means2 holds one row per simulation with the means of pred0-pred6;
* contribution g is the change in mean predicted probability when
  variable group g is switched from white to minority values;
* only six groups are defined so there are six contributions - the
  earlier cont7 and cont8 lines referred to pred7 and pred8 which are
  never created and therefore only produced missing values;
data means2;
  set means2;
  cont1=pred0-pred1;
  cont2=pred1-pred2;
  cont3=pred2-pred3;
  cont4=pred3-pred4;
  cont5=pred4-pred5;
  cont6=pred5-pred6;
run;

* average the simulation results - prints the statistics and stores
  the means as a single observation in meandecomp;
proc means data=means2;
  title2 'Mean Values of Contribution Estimates from Simulations';
  var yw pred0-pred6 ym cont1-cont6;
  output out=meandecomp mean=;
run;

* add the full-sample means of the dependent variables to that single
  observation and express each contribution as a share of the gap;
* gap is the white full-sample mean minus the minority full-sample mean;
data meandecomp;
  merge meandecomp ywdata ymdata;
  gap=ywfull-ymfull;
  array contrib(6) cont1-cont6;
  array share(6) perc1-perc6;
  do g=1 to 6;
     share(g)=contrib(g)/gap;
  end;
  drop g;
run;

  

* print the final decomposition table - full-sample means, the gap,
  and each contribution followed by its share of the gap;
* meandecomp has one observation so the printed means are its values;
proc means data=meandecomp;
  title2 'Final Output for Table - Mean Values of Decomposition Runs';
  var ywfull ymfull gap
      cont1 perc1
      cont2 perc2
      cont3 perc3
      cont4 perc4
      cont5 perc5
      cont6 perc6;
run;

* compare the actual outcome mean with the mean predicted probability
  in each full sample;
proc means data=white;
  title2 'Mean Predicted Probability for White Full Sample';
  var yw wordprob;
run;
proc means data=minority;
  title2 'Mean Predicted Probability for Minority Full Sample';
  var ym mordprob;
run;