***************************************************************************;
* Non-Linear Decomposition Technique for Logit or Probit Model            *;
* Specified Ordering of Variables - Faster Version                        *;
* Updated on 6/9/15                                                       *;
* Originally developed in:                                                *;
* Fairlie, Robert W. 1999. "The Absence of the African-American Owned     *;
*   Business: An Analysis of the Dynamics of Self-Employment,"            *;
*   Journal of Labor Economics, 17(1): 80-108.                            *;
* Revised to randomly match black/white distributions in:                 *;
* Fairlie, Robert W., and Alicia M. Robb. 2007. "Why are Black-Owned      *;
*   Businesses Less Successful than White-Owned Businesses: The Role      *;
*   of Families, Inheritances, and Business Human Capital,"               *;
*   Journal of Labor Economics, 25(2): 289-323.                           *;
* Revised to incorporate sample weights if needed as discussed in:        *;
* Fairlie, Robert W. 2014. "Addressing Path Dependence and                *;
*   Incorporating Sample Weights in the Nonlinear Blinder-Oaxaca          *;
*   Decomposition Technique for Logit, Probit and Other Nonlinear         *;
*   Models," University of California, Santa Cruz Working Paper.          *;
***************************************************************************;

* NOTES:
* Currently set for use with sample weights, see NOTE:WEIGHTS to remove sample weights;
* Currently set for pooled coefficient estimates, see NOTE:POOLED to change to white or minority coefficient estimates;
* Currently set for 100 iterations, see NOTE:ITERATIONS to change;
* Currently set for Logit model, see NOTE:PROBIT to change;
* Example data set can be downloaded at http://people.ucsc.edu/~rfairlie/decomposition/;

libname sasdata 'c:\temp\decomp';
options obs=max;
options nolabel ls=75 ps=140;

* specify number of iterations;
* NOTE:ITERATIONS change this to 1000 or higher for final run, but test with fewer;
%let numiterations=100;

* NOTE:POOLED define race variables to be included only in pooled logits;
%let r=4;
%let racevars=black latino natamer asian;

* specify total number of independent variables, not including race dummies above;
%let k=31;

* define categories of variables for decomposition;
%let group1= female age;
%let group2= married prevmar children chld617;
%let group3= hsgrad somcol college gradsch;
%let group4= inc1015 inc1520 inc2025 inc2530 inc3035 inc3540 inc4050 inc5060 inc6075 incgt75;
%let group5= midatlan encent wncent satlan escent wscent mountain pacific;
%let group6= notcc notmsa notid;

* define short labels for each group of variables for final table output;
%let labelgroup1="Gender/Age";
%let labelgroup2="Family";
%let labelgroup3="Education";
%let labelgroup4="Income";
%let labelgroup5="Region";
%let labelgroup6="City";

* rename minority variables;
%let mgroup1= mfemale mage;
%let mgroup2= mmarried mprevmar mchildre mchld617;
%let mgroup3= mhsgrad msomcol mcollege mgradsch;
%let mgroup4= minc1015 minc1520 minc2025 minc2530 minc3035 minc3540 minc4050 minc5060 minc6075 mincgt75;
%let mgroup5= mmidatla mencent mwncent msatlan mescent mwscent mmountai mpacific;
%let mgroup6= mnotcc mnotmsa mnotid;

* rename white variables;
%let wgroup1= wfemale wage;
%let wgroup2= wmarried wprevmar wchildre wchld617;
%let wgroup3= whsgrad wsomcol wcollege wgradsch;
%let wgroup4= winc1015 winc1520 winc2025 winc2530 winc3035 winc3540 winc4050 winc5060 winc6075 wincgt75;
%let wgroup5= wmidatla wencent wwncent wsatlan wescent wwscent wmountai wpacific;
%let wgroup6= wnotcc wnotmsa wnotid;

* combine groups;
%let vars= &group1 &group2 &group3 &group4 &group5 &group6;
%let mvars= &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &mgroup6;
%let wvars= &wgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
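* Overview of the sequential decomposition implemented below: with the
  groups ordered 1 through 6, pred(g) is the mean predicted probability
  using minority distributions for groups 1..g and white distributions
  for groups g+1..6, so pred0 uses all-white and pred6 all-minority
  distributions. The contribution of group g is cont(g) = pred(g-1) -
  pred(g), and the contributions telescope so that cont1+...+cont6 =
  pred0 - pred6. In the notation of Fairlie (1999), a sketch of the
  formula for group g is
    contrib_g = (1/Nm) * sum over i of ( F(Xi(g-1)B) - F(Xi(g)B) )
  where Nm is the matched (minority-sized) sample, F is the logistic cdf
  and B is the vector of pooled coefficient estimates;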
* prepare original data for program;
data temp;
  set sasdata.finaldecomp00;
  mergeobs=1;
  * define dependent variable;
  y=hcomp;
  * delete observations with missing values for the dependent variable or key independent variables (extend this check as needed for your data);
  if y=. or hsgrad=. or inc1015=. then delete;
  * delete observations with missing, zero or negative weights;
  if wgt=. or wgt<=0 then delete;
  * NOTE:WEIGHTS set all weights to 1 to run decomposition without sample weights;
  *wgt=1;
  * define sample - e.g. only keep working-age adults for this run;
  if age<25 or age>55 then delete;

* define subset of data for estimating coefficients;
* currently set to use all groups (pooled sample) to estimate coefficients;
* could be changed to use sample for only one group (e.g. white or minority coefficients);
data regdata;
  set temp;
  * if white=1 or black=1;

* create title based on sample chosen for data used in regression;
title 'Decomposition: Home Computer - Pooled (All Races) Coefficients - Specified Ordering';

* create minority sample with minority variable names;
* define which minority group is used;
data minority (keep=ym wgtm &mvars mergeobs);
  set temp;
  array varsa(&k) &vars;
  array mvarsa(&k) &mvars;
  ym=y;
  wgtm=wgt;
  do i=1 to &k;
    mvarsa(i)=varsa(i);
  end;
  if black=1 then output;

* create full white sample with white variable names;
data white (keep=yw wgtw &wvars mergeobs);
  set temp;
  array varsa(&k) &vars;
  array wvarsa(&k) &wvars;
  yw=y;
  wgtw=wgt;
  do i=1 to &k;
    wvarsa(i)=varsa(i);
  end;
  if white=1 then output;

* print out full sample means;
proc means data=minority;
  weight wgtm;
  title2 'Minority Means - Full Sample';
proc means data=white;
  weight wgtw;
  title2 'White Means - Full Sample';

* calculate means of dependent variables for full sample;
* these values are used to calculate the total gap in the decomposition;
proc means data=minority noprint;
  var ym;
  weight wgtm;
  output out=ymdata mean=ymfull n=mn;
proc means data=white noprint;
  var yw;
  weight wgtw;
  output out=ywdata mean=ywfull n=wn;

* define global variables for full white and full minority sample sizes;
data ymdata2;
  set ymdata;
  call symput('mobs', mn);
data ywdata2;
  set ywdata;
  call symput('wobs', wn);
run;

* estimate logit model to obtain coefficients;
* NOTE:PROBIT to change to probit use link=normit;
* NOTE:POOLED set for pooled or specific group sample above;
proc logistic data=regdata outest=orgcoefs covout descending;
  model y=&racevars &vars / link=logit;
  weight wgt / normalize;
  title2 'Logit for Coefficients';

* remove race dummies from coefficient dataset;
* NOTE:POOLED only need this for pooled estimates;
data coefs (drop=&racevars _link_ _type_ _status_ _name_ _lnlike_);
  set orgcoefs;
  mergeobs=1;
  if _n_=1; /* coefs are in first row */

* calculate predicted probabilities for both samples;
data white;
  merge white coefs;
  by mergeobs;
  array coefsa(&k) &vars;
  array wvarsa(&k) &wvars;
  xbeta=intercept;
  do i=1 to &k;
    xbeta=xbeta+wvarsa(i)*coefsa(i);
  end;
  wordprob=exp(xbeta)/(1+exp(xbeta)); *NOTE:PROBIT use normal distribution for probit;
data minority;
  merge minority coefs;
  by mergeobs;
  array coefsa(&k) &vars;
  array mvarsa(&k) &mvars;
  xbeta=intercept;
  do i=1 to &k;
    xbeta=xbeta+mvarsa(i)*coefsa(i);
  end;
  mordprob=exp(xbeta)/(1+exp(xbeta)); *NOTE:PROBIT use normal distribution for probit;

* create empty starting dataset for iterations;
data means2;
  set _null_;
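* NOTE:PROBIT sketch - to estimate a probit rather than a logit, one
  minimal set of changes (a sketch, not a tested drop-in) is to use
  link=normit in proc logistic above and, wherever a NOTE:PROBIT marker
  appears, swap the logistic cdf and pdf for their normal counterparts
  using the built-in probnorm and pdf functions, for example;
* wordprob=probnorm(xbeta);
* fhat0=pdf('normal',xb0);
* the first line replaces exp(xbeta)/(1+exp(xbeta)) with the normal cdf
  and the second replaces pred0*(1-pred0) with the normal pdf;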
* create macro for iterations;
%macro simulate;
%do i=1 %to &numiterations;

* create random subsample of whites with same sample size as minorities;
proc surveyselect data=white method=pps_wr rep=1 sampsize=&mobs seed=&i
  out=white1 outhits noprint;
  size wgtw;

* randomly order white subsample prior to matching;
data white2;
  set white1;
  random1=ranuni(&i);
proc sort data=white2;
  by random1;

* create weighted random resample of minorities with the same sample size;
proc surveyselect data=minority method=pps_wr rep=1 sampsize=&mobs seed=&i
  out=minority1 outhits noprint;
  size wgtm;

* merge datasets together for matching, random matching;
data combined;
  merge white2 minority1;

* calculate decomposition components;
data combined;
  set combined;
  one=1;
  array coefsa(&k) &vars;
  * define distribution switches as arrays;
  array x0a(&k) &wgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
  array x1a(&k) &mgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
  array x2a(&k) &mgroup1 &mgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
  array x3a(&k) &mgroup1 &mgroup2 &mgroup3 &wgroup4 &wgroup5 &wgroup6;
  array x4a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &wgroup5 &wgroup6;
  array x5a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &wgroup6;
  array x6a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &mgroup6;
  xb0=intercept;
  xb1=intercept;
  xb2=intercept;
  xb3=intercept;
  xb4=intercept;
  xb5=intercept;
  xb6=intercept;
  * perform white to black variable distribution switches;
  do i=1 to &k;
    xb0=xb0+x0a(i)*coefsa(i);
    xb1=xb1+x1a(i)*coefsa(i);
    xb2=xb2+x2a(i)*coefsa(i);
    xb3=xb3+x3a(i)*coefsa(i);
    xb4=xb4+x4a(i)*coefsa(i);
    xb5=xb5+x5a(i)*coefsa(i);
    xb6=xb6+x6a(i)*coefsa(i);
  end;
  * calculate various predicted probabilities;
  * NOTE:PROBIT update for normal distribution;
  pred0=exp(xb0)/(1+exp(xb0));
  pred1=exp(xb1)/(1+exp(xb1));
  pred2=exp(xb2)/(1+exp(xb2));
  pred3=exp(xb3)/(1+exp(xb3));
  pred4=exp(xb4)/(1+exp(xb4));
  pred5=exp(xb5)/(1+exp(xb5));
  pred6=exp(xb6)/(1+exp(xb6));
  * calculate various pdfs for standard error calculations;
  * NOTE:PROBIT update for normal distribution;
  fhat0=pred0*(1-pred0);
  fhat1=pred1*(1-pred1);
  fhat2=pred2*(1-pred2);
  fhat3=pred3*(1-pred3);
  fhat4=pred4*(1-pred4);
  fhat5=pred5*(1-pred5);
  fhat6=pred6*(1-pred6);
  * create intercept component of derivatives;
  dc1db0=fhat0*one-fhat1*one;
  dc2db0=fhat1*one-fhat2*one;
  dc3db0=fhat2*one-fhat3*one;
  dc4db0=fhat3*one-fhat4*one;
  dc5db0=fhat4*one-fhat5*one;
  dc6db0=fhat5*one-fhat6*one;
  * calculate contribution derivatives (delta method);
  array dc1dba(&k) dc1db1-dc1db&k;
  array dc2dba(&k) dc2db1-dc2db&k;
  array dc3dba(&k) dc3db1-dc3db&k;
  array dc4dba(&k) dc4db1-dc4db&k;
  array dc5dba(&k) dc5db1-dc5db&k;
  array dc6dba(&k) dc6db1-dc6db&k;
  * create other variable components of derivatives;
  do i=1 to &k;
    dc1dba(i)=fhat0*x0a(i)-fhat1*x1a(i);
    dc2dba(i)=fhat1*x1a(i)-fhat2*x2a(i);
    dc3dba(i)=fhat2*x2a(i)-fhat3*x3a(i);
    dc4dba(i)=fhat3*x3a(i)-fhat4*x4a(i);
    dc5dba(i)=fhat4*x4a(i)-fhat5*x5a(i);
    dc6dba(i)=fhat5*x5a(i)-fhat6*x6a(i);
  end;
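* The standard errors computed below use the delta method: for each group
  g, the variance of the contribution estimate is approximated by
  t(dCg/dB) * V * (dCg/dB), where dCg/dB is the mean derivative vector
  (dcGdb0, dcGdb1, ..., dcGdbk) computed above and V is the coefficient
  covariance matrix saved by proc logistic (covout). The proc iml step
  below implements this as VARg = DCgDB * V * t(DCgDB);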
*****************************;
* calculate standard errors *;
*****************************;

* clean up coefficient/covariance dataset;
data covar (drop=&racevars _link_ _type_ _status_ _name_ _lnlike_);
  set orgcoefs;
  * delete coefs and rows associated with racevars;
  if _n_=1 or (3<=_n_<=(&r+2)) then delete;

* calculate decomposition estimates to save and use for variance calculations;
proc means data=combined noprint;
  var yw ym pred0-pred6
      dc1db0 dc1db1-dc1db&k
      dc2db0 dc2db1-dc2db&k
      dc3db0 dc3db1-dc3db&k
      dc4db0 dc4db1-dc4db&k
      dc5db0 dc5db1-dc5db&k
      dc6db0 dc6db1-dc6db&k;
  output out=means1 mean=;

* create separate datasets to read into proc iml;
* NOTE: check to make sure variables are in the proper order to match to the covariance matrix;
data cont1 (keep=dc1db0 dc1db1-dc1db&k)
     cont2 (keep=dc2db0 dc2db1-dc2db&k)
     cont3 (keep=dc3db0 dc3db1-dc3db&k)
     cont4 (keep=dc4db0 dc4db1-dc4db&k)
     cont5 (keep=dc5db0 dc5db1-dc5db&k)
     cont6 (keep=dc6db0 dc6db1-dc6db&k);
  set means1;

proc iml;
  use covar;
  read all var _num_ into V;
  use cont1;
  read all var _num_ into DC1DB;
  use cont2;
  read all var _num_ into DC2DB;
  use cont3;
  read all var _num_ into DC3DB;
  use cont4;
  read all var _num_ into DC4DB;
  use cont5;
  read all var _num_ into DC5DB;
  use cont6;
  read all var _num_ into DC6DB;
  * calculate standard errors;
  VAR1=DC1DB*V*t(DC1DB);
  VAR2=DC2DB*V*t(DC2DB);
  VAR3=DC3DB*V*t(DC3DB);
  VAR4=DC4DB*V*t(DC4DB);
  VAR5=DC5DB*V*t(DC5DB);
  VAR6=DC6DB*V*t(DC6DB);
  create vardata var {var1 var2 var3 var4 var5 var6};
  append;

* merge variance calculations from proc iml to decomp dataset;
data means1 (keep=yw ym pred0-pred6 var1-var6);
  merge means1 vardata;

* append latest iteration results to all previous iteration results;
data means2;
  set means2 means1;
run;

%end;
%mend;
run;

* turn off notes because macro generates a lot of information, remove this option for debugging program;
options nonotes;

* run simulation - note that it runs numiterations times because of do loop above;
%simulate;
run;

* calculate contribution estimates from changes in predicted probabilities;
data means2;
  set means2;
  cont1=pred0-pred1;
  cont2=pred1-pred2;
  cont3=pred2-pred3;
  cont4=pred3-pred4;
  cont5=pred4-pred5;
  cont6=pred5-pred6;

* calculate means of decomposition runs;
proc means data=means2;
  title2 'Mean Values of Contribution Estimates from Simulations';
  var yw pred0-pred6 ym cont1-cont6 var1-var6;
  output out=meandecomp mean=;

* append the full sample means for ys and calculate percent contributions;
* dataset now has only one obs for the means;
data meandecomp;
  merge meandecomp ywdata ymdata;
  gap=ywfull-ymfull;
  perc1=cont1/gap;
  perc2=cont2/gap;
  perc3=cont3/gap;
  perc4=cont4/gap;
  perc5=cont5/gap;
  perc6=cont6/gap;
  se1=sqrt(var1);
  se2=sqrt(var2);
  se3=sqrt(var3);
  se4=sqrt(var4);
  se5=sqrt(var5);
  se6=sqrt(var6);
  label cont1=&labelgroup1;
  label cont2=&labelgroup2;
  label cont3=&labelgroup3;
  label cont4=&labelgroup4;
  label cont5=&labelgroup5;
  label cont6=&labelgroup6;
options label;

* format output for final decomposition table;
* outputs contribution estimates, gap percents and standard errors;
proc means data=meandecomp mean;
  title2 'Final Output for Table - Mean Values for Contribution Estimates';
  var ywfull ymfull gap
      cont1 se1 perc1
      cont2 se2 perc2
      cont3 se3 perc3
      cont4 se4 perc4
      cont5 se5 perc5
      cont6 se6 perc6;
run;
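* NOTE:POOLED sketch - to estimate with white (or minority) rather than
  pooled coefficients, one possible set of changes (a sketch only, not
  tested against the example data) is to subset the estimation sample and
  drop the race dummies from the model, for example;
* data regdata;
*   set temp;
*   if white=1;
* proc logistic data=regdata outest=orgcoefs covout descending;
*   model y=&vars / link=logit;
*   weight wgt / normalize;
* note that the race-dummy cleanup steps above (the drop=&racevars dataset
  options and the deletion of racevar rows from the covar dataset) would
  also need to be adjusted, since those variables would no longer appear
  in orgcoefs;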