***************************************************************************;
* Non-Linear Decomposition Technique for Logit or Probit Model            *;
* Randomized Ordering of Variables - Addresses Path Dependence            *;
* Updated on 6/9/15                                                       *;
* Originally developed in:                                                *;
*   Fairlie, Robert W. 1999. "The Absence of the African-American Owned   *;
*   Business: An Analysis of the Dynamics of Self-Employment,"            *;
*   Journal of Labor Economics, 17(1): 80-108.                            *;
* Revised to randomly match black/white distributions in:                 *;
*   Fairlie, Robert W., and Alicia M. Robb. 2007. "Why are Black-Owned    *;
*   Businesses Less Successful than White-Owned Businesses: The Role      *;
*   of Families, Inheritances, and Business Human Capital,"               *;
*   Journal of Labor Economics, 25(2): 289-323.                           *;
* Revised to randomize variable ordering and incorporate sample weights   *;
* if needed as discussed in:                                              *;
*   Fairlie, Robert W. 2014. "Addressing Path Dependence and              *;
*   Incorporating Sample Weights in the Nonlinear Blinder-Oaxaca          *;
*   Decomposition Technique for Logit, Probit and Other Nonlinear         *;
*   Models," University of California, Santa Cruz Working Paper.          *;
***************************************************************************;
* NOTES:;
* Currently set for use with sample weights, see NOTE:WEIGHTS to remove sample weights;
* Currently set for pooled coefficient estimates, see NOTE:POOLED to change to white or minority coefficient estimates;
* Currently set for 100 iterations, see NOTE:ITERATIONS to change;
* Currently set for Logit model, see NOTE:PROBIT to change;
* Example data set can be downloaded at http://people.ucsc.edu/~rfairlie/decomposition/;

libname sasdata 'c:\temp\decomp';
options obs=max;
options nolabel ls=75 ps=140;

* specify number of iterations;
* NOTE:ITERATIONS change this to 1000 or higher for final run, but test with fewer;
%let numiterations=100;

* NOTE:POOLED define race variables to be included only in pooled logits;
* &r is the count of race dummies; it is used below to strip their rows from;
* the covariance matrix output by proc logistic;
%let r=4;
%let racevars=black latino natamer asian;

* specify total number of independent variables, not including race dummies above;
%let k=31;

* define categories ("groups") of variables for decomposition;
* the six groups together must contain exactly &k variables;
%let definegroup1 = female age;
%let definegroup2 = married prevmar children chld617;
%let definegroup3 = hsgrad somcol college gradsch;
%let definegroup4 = inc1015 inc1520 inc2025 inc2530 inc3035 inc3540 inc4050 inc5060 inc6075 incgt75;
%let definegroup5 = midatlan encent wncent satlan escent wscent mountain pacific;
%let definegroup6 = notcc notmsa notid;

* define short labels for each group of variables for final table output;
%let labelgroup1="Gender/Age";
%let labelgroup2="Family";
%let labelgroup3="Education";
%let labelgroup4="Income";
%let labelgroup5="Region";
%let labelgroup6="City";

* rename minority variables (m-prefixed copies of each group, in the same order);
%let mdefinegroup1 = mfemale mage;
%let mdefinegroup2 = mmarried mprevmar mchildre mchld617;
%let mdefinegroup3 = mhsgrad msomcol mcollege mgradsch;
%let mdefinegroup4 = minc1015 minc1520 minc2025 minc2530 minc3035 minc3540 minc4050 minc5060 minc6075 mincgt75;
%let mdefinegroup5 = mmidatla mencent mwncent msatlan mescent mwscent mmountai mpacific;
%let mdefinegroup6 = mnotcc mnotmsa mnotid;

* rename white variables (w-prefixed copies of each group, in the same order);
%let wdefinegroup1 = wfemale wage;
%let wdefinegroup2 = wmarried wprevmar wchildre wchld617;
%let wdefinegroup3 = whsgrad wsomcol wcollege wgradsch;
%let wdefinegroup4 = winc1015 winc1520 winc2025 winc2530 winc3035 winc3540 winc4050 winc5060 winc6075 wincgt75;
%let wdefinegroup5 = wmidatla wencent wwncent wsatlan wescent wwscent wmountai wpacific;
%let wdefinegroup6 = wnotcc wnotmsa wnotid;

* combine groups into full &k-element variable lists;
%let vars= &definegroup1 &definegroup2 &definegroup3 &definegroup4 &definegroup5 &definegroup6;
%let mvars= &mdefinegroup1 &mdefinegroup2 &mdefinegroup3 &mdefinegroup4 &mdefinegroup5 &mdefinegroup6;
%let wvars= &wdefinegroup1 &wdefinegroup2 &wdefinegroup3 &wdefinegroup4 &wdefinegroup5 &wdefinegroup6;

* prepare original data for program;
* mergeobs=1 is a constant key used later to broadcast the one-row coefficient;
* dataset onto every observation via a by-merge;
data temp;
set sasdata.finaldecomp00;
mergeobs=1;
* define dependent variable;
y=hcomp;
* delete observations with any missing values for dependent or independent variables;
if y=. or hsgrad=. or inc1015=. then delete;
* delete observations with missing, zero or negative weights;
if wgt=. or wgt<=0 then delete;
* NOTE:WEIGHTS set all weights to 1 to run decomposition without sample weights;
*wgt=1;
* define sample - e.g. only keep working-age adults for this run;
if age<25 or age>55 then delete;

* define subset of data for estimating coefficients;
* currently set to use all groups (pooled sample) to estimate coefficients;
* could be changed to use sample for only one group (e.g. white or minority coefficients);
data regdata;
set temp;
* if white=1 or black=1;

* Create title based on sample chosen for data used in regression;
title 'Decomposition: Home Computer - Pooled (All Races) Coefficients - Random Ordering';

* create minority sample with minority variable names;
* copies each of the &k regressors into its m-prefixed counterpart so the;
* minority data can later coexist with the white data in one dataset;
* define which minority group is used (here: black=1);
data minority (keep=ym wgtm &mvars mergeobs);
set temp;
array varsa(&k) &vars;
array mvarsa(&k) &mvars;
ym=y;
wgtm=wgt;
do i=1 to &k;
mvarsa(i)=varsa(i);
end;
if black=1 then output;

* create full white sample with white variable names (w-prefixed copies);
data white (keep=yw wgtw &wvars mergeobs);
set temp;
array varsa(&k) &vars;
array wvarsa(&k) &wvars;
yw=y;
wgtw=wgt;
do i=1 to &k;
wvarsa(i)=varsa(i);
end;
if white=1 then output;

* print out full sample means;
proc means data=minority;
weight wgtm;
title2 'Minority Means - Full Sample';
proc means data=white;
weight wgtw;
title2 'White Means - Full Sample';

* calculate means of dependent variables for full sample;
* these values are used to calculate the total gap in the decomposition;
proc means data=minority noprint;
var ym;
weight wgtm;
output out=ymdata mean=ymfull n=mn;
proc means data=white noprint;
var yw;
weight wgtw;
output out=ywdata mean=ywfull n=wn;

* define global macro variables for full white and full minority sample sizes;
* &mobs drives the subsample sizes drawn inside the simulation macro;
data ymdata2; set ymdata; call symput('mobs',mn); run;
data ywdata2; set ywdata; call symput('wobs',wn); run;

* estimate logit model to obtain coefficients;
* NOTE:PROBIT to change to probit use link=normit;
* NOTE:POOLED set for pooled or specific group sample above;
* covout also writes the covariance matrix rows into the outest dataset;
proc logistic data=regdata outest=orgcoefs covout descending;
model y=&racevars &vars / link=logit;
weight wgt / normalize;
title2 'Logit for Coefficients';
run;

* remove race dummies from coefficient dataset;
* NOTE:POOLED only need this for pooled estimates;
data coefs (drop=&racevars _link_ _type_ _status_ _name_ _lnlike_);
set orgcoefs;
mergeobs=1;
if _n_=1; /* coefs are in first row */

* calculate predicted probabilities for both samples;
* merging the one-row coefs dataset by mergeobs attaches the coefficient;
* values (stored under the unprefixed names in &vars, plus intercept) to;
* every observation;
data white;
merge white coefs;
by mergeobs;
array coefsa(&k) &vars;
array wvarsa(&k) &wvars;
xbeta=intercept;
do i=1 to &k;
xbeta=xbeta+wvarsa(i)*coefsa(i);
end;
wordprob=exp(xbeta)/(1+exp(xbeta)); *NOTE:PROBIT use normal distribution for probit;
data minority;
merge minority coefs;
by mergeobs;
array coefsa(&k) &vars;
array mvarsa(&k) &mvars;
xbeta=intercept;
do i=1 to &k;
xbeta=xbeta+mvarsa(i)*coefsa(i);
end;
mordprob=exp(xbeta)/(1+exp(xbeta)); *NOTE:PROBIT use normal distribution for probit;

* create empty starting dataset to accumulate iteration results;
data means2;
set _null_;

* create macro for iterations;
* each iteration: (1) draw a weighted with-replacement white subsample of;
* size &mobs and randomly order it, (2) draw a weighted with-replacement;
* minority subsample of the same size, (3) match the two samples row by row,;
* (4) randomly reorder the six variable groups, (5) switch distributions;
* white-to-minority one group at a time and record each group contribution;
* and its delta-method variance, (6) append results to means2;
%macro simulate;
%do i=1 %to &numiterations;

* create random subsample of whites with same sample size as minorities;
* pps_wr = probability proportional to (weight) size, with replacement;
* NOTE(review): seed=&i is reused for both surveyselect calls and for;
* ranuni below in the same iteration - confirm this dependence is intended;
proc surveyselect data=white method=pps_wr rep=1 sampsize=&mobs seed=&i out=white1 outhits noprint;
size wgtw;

* randomly order white subsample prior to matching;
data white2;
set white1;
random1=ranuni(&i);
proc sort data=white2;
by random1;

* create random subsample of minorities, also of size &mobs;
proc surveyselect data=minority method=pps_wr rep=1 sampsize=&mobs seed=&i out=minority1 outhits noprint;
size wgtm;

* merge datasets together for matching - a one-to-one (row order) merge,;
* which is the random matching of white to minority observations;
data combined;
merge white2 minority1;
one=1;

* randomly assign variable groups ("defined groups") to "number groups";
* for random ordering of variable groups;
data rand;
do i=1 to 6;
r = ranuni(&i);
output;
end;
proc sort data=rand;
by r;

* after the sort, row _n_ holds original group i in random position _n_;;
* call symput writes macro variables group1-group6 (and m/w variants) so;
* that &groupJ expands to the variable list placed J-th in this iteration;
data orderiterationtemp;
set rand;
group = catt("group",i);
mgroup = catt("mgroup",i);
wgroup = catt("wgroup",i);
if _n_ = 1 then do;
call symput(group,"&definegroup1");
call symput(mgroup,"&mdefinegroup1");
call symput(wgroup,"&wdefinegroup1");
end;
else if _n_ = 2 then do;
call symput(group,"&definegroup2");
call symput(mgroup,"&mdefinegroup2");
call symput(wgroup,"&wdefinegroup2");
end;
else if _n_ = 3 then do;
call symput(group,"&definegroup3");
call symput(mgroup,"&mdefinegroup3");
call symput(wgroup,"&wdefinegroup3");
end;
else if _n_ = 4 then do;
call symput(group,"&definegroup4");
call symput(mgroup,"&mdefinegroup4");
call symput(wgroup,"&wdefinegroup4");
end;
else if _n_ = 5 then do;
call symput(group,"&definegroup5");
call symput(mgroup,"&mdefinegroup5");
call symput(wgroup,"&wdefinegroup5");
end;
else do;
call symput(group,"&definegroup6");
call symput(mgroup,"&mdefinegroup6");
call symput(wgroup,"&wdefinegroup6");
end;
* ordgroupJ records which original group sits in random position J;
array ordgroup {6} ordgroup1 ordgroup2 ordgroup3 ordgroup4 ordgroup5 ordgroup6;
do j = 1 to 6;
if _n_ = j then ordgroup[j] = i;
end;
one = 1;

* create dataset recording which original variable group is assigned to;
* each randomly ordered group (collapses the 6 rows into one);
proc means data = orderiterationtemp noprint;
var ordgroup1 ordgroup2 ordgroup3 ordgroup4 ordgroup5 ordgroup6 one;
output out = orderiteration mean = ;

* clean up dataset;
data orderiteration (drop = _type_ _freq_);
set orderiteration;

* append information about random order of original variable groups;
data combined;
merge combined orderiteration;
by one;

* calculate decomposition components;
data combined;
set combined;
* variables and coefficients need to be in the new random order for this;
* iteration; the unprefixed names in &groupJ hold coefficient values;
* (merged from coefs above), while the m/w-prefixed names hold data;
%let newvars= &group1 &group2 &group3 &group4 &group5 &group6;
array newcoefsa (&k) &group1 &group2 &group3 &group4 &group5 &group6;
* define distribution switches as arrays: xJa replaces the first J groups;
* of the white distribution with the minority distribution;
array x0a(&k) &wgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
array x1a(&k) &mgroup1 &wgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
array x2a(&k) &mgroup1 &mgroup2 &wgroup3 &wgroup4 &wgroup5 &wgroup6;
array x3a(&k) &mgroup1 &mgroup2 &mgroup3 &wgroup4 &wgroup5 &wgroup6;
array x4a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &wgroup5 &wgroup6;
array x5a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &wgroup6;
array x6a(&k) &mgroup1 &mgroup2 &mgroup3 &mgroup4 &mgroup5 &mgroup6;
xb0=intercept;
xb1=intercept;
xb2=intercept;
xb3=intercept;
xb4=intercept;
xb5=intercept;
xb6=intercept;
* perform white to minority variable distribution switches;
do i=1 to &k;
xb0=xb0+x0a(i)*newcoefsa(i);
xb1=xb1+x1a(i)*newcoefsa(i);
xb2=xb2+x2a(i)*newcoefsa(i);
xb3=xb3+x3a(i)*newcoefsa(i);
xb4=xb4+x4a(i)*newcoefsa(i);
xb5=xb5+x5a(i)*newcoefsa(i);
xb6=xb6+x6a(i)*newcoefsa(i);
end;
* calculate various predicted probabilities (logistic cdf of each index);
* NOTE:PROBIT update for normal distribution;
pred0=exp(xb0)/(1+exp(xb0));
pred1=exp(xb1)/(1+exp(xb1));
pred2=exp(xb2)/(1+exp(xb2));
pred3=exp(xb3)/(1+exp(xb3));
pred4=exp(xb4)/(1+exp(xb4));
pred5=exp(xb5)/(1+exp(xb5));
pred6=exp(xb6)/(1+exp(xb6));
* calculate various pdfs for standard error calculations;
* (logistic pdf = F*(1-F) evaluated at each index);
* NOTE:PROBIT update for normal distribution;
fhat0=pred0*(1-pred0);
fhat1=pred1*(1-pred1);
fhat2=pred2*(1-pred2);
fhat3=pred3*(1-pred3);
fhat4=pred4*(1-pred4);
fhat5=pred5*(1-pred5);
fhat6=pred6*(1-pred6);
* create intercept component to derivatives (one=1 is the constant term);
dc1db0=fhat0*one-fhat1*one;
dc2db0=fhat1*one-fhat2*one;
dc3db0=fhat2*one-fhat3*one;
dc4db0=fhat3*one-fhat4*one;
dc5db0=fhat4*one-fhat5*one;
dc6db0=fhat5*one-fhat6*one;
* calculate contribution derivatives (delta method);
array dc1dba(&k) dc1db1-dc1db&k;
array dc2dba(&k) dc2db1-dc2db&k;
array dc3dba(&k) dc3db1-dc3db&k;
array dc4dba(&k) dc4db1-dc4db&k;
array dc5dba(&k) dc5db1-dc5db&k;
array dc6dba(&k) dc6db1-dc6db&k;
* create other variable components to derivatives;
do i=1 to &k;
dc1dba(i)=fhat0*x0a(i)-fhat1*x1a(i);
dc2dba(i)=fhat1*x1a(i)-fhat2*x2a(i);
dc3dba(i)=fhat2*x2a(i)-fhat3*x3a(i);
dc4dba(i)=fhat3*x3a(i)-fhat4*x4a(i);
dc5dba(i)=fhat4*x4a(i)-fhat5*x5a(i);
dc6dba(i)=fhat5*x5a(i)-fhat6*x6a(i);
end;

*****************************;
* calculate standard errors *;
*****************************;
* re-estimate logit model with new random variable ordering so the;
* covariance matrix columns line up with the reordered derivative vectors;
* NOTE:PROBIT to change to probit use link=normit;
* NOTE:POOLED set for pooled or specific group sample above;
proc logistic data=regdata outest=orgcoefs2 covout descending noprint;
model y=&racevars &newvars / link=logit;
weight wgt / normalize;

* clean up coefficient/covariance dataset;
data covar (drop=&racevars _link_ _type_ _status_ _name_ _lnlike_);
set orgcoefs2;
* delete coefs (row 1) and covariance rows associated with racevars;
* (rows 3 through &r+2); row 2 is kept;
if _n_=1 or (3<=_n_<=(&r+2)) then delete;

* calculate decomposition estimates (means over matched pairs) to save;
* and use for variance calculations;
proc means data=combined noprint;
var yw ym pred0-pred6 dc1db0 dc1db1-dc1db&k dc2db0 dc2db1-dc2db&k dc3db0 dc3db1-dc3db&k dc4db0 dc4db1-dc4db&k dc5db0 dc5db1-dc5db&k dc6db0 dc6db1-dc6db&k ordgroup1-ordgroup6;
output out=means1 mean=;

* create separate datasets to read into proc iml;
* NOTE: check to make sure variables are in the proper order to match to the covariance matrix;
data cont1 (keep=dc1db0 dc1db1-dc1db&k) cont2 (keep=dc2db0 dc2db1-dc2db&k) cont3 (keep=dc3db0 dc3db1-dc3db&k) cont4 (keep=dc4db0 dc4db1-dc4db&k) cont5 (keep=dc5db0 dc5db1-dc5db&k) cont6 (keep=dc6db0 dc6db1-dc6db&k);
set means1;

* delta-method variances: VARj = DCjDB * V * DCjDB' for each group j;
proc iml;
use covar;
read all var _num_ into V;
use cont1;
read all var _num_ into DC1DB;
use cont2;
read all var _num_ into DC2DB;
use cont3;
read all var _num_ into DC3DB;
use cont4;
read all var _num_ into DC4DB;
use cont5;
read all var _num_ into DC5DB;
use cont6;
read all var _num_ into DC6DB;
* calculate standard error;
VAR1=DC1DB*V*t(DC1DB);
VAR2=DC2DB*V*t(DC2DB);
VAR3=DC3DB*V*t(DC3DB);
VAR4=DC4DB*V*t(DC4DB);
VAR5=DC5DB*V*t(DC5DB);
VAR6=DC6DB*V*t(DC6DB);
create vardata var {var1 var2 var3 var4 var5 var6};
append;

* merge variance calculations from proc iml to decomp dataset;
* contJ = drop in mean predicted probability from switching group J;
data means1 (keep=yw ym pred0-pred6 var1-var6 cont1-cont6 ordgroup1-ordgroup6);
merge means1 vardata;
cont1=pred0-pred1;
cont2=pred1-pred2;
cont3=pred2-pred3;
cont4=pred3-pred4;
cont5=pred4-pred5;
cont6=pred5-pred6;

* record original variable group contributions from random order variable;
* group contributions: position-J results are mapped back to the original;
* group number stored in ordgroupJ;
data means1 (keep = yw ym contg1-contg6 vargroup1-vargroup6);
set means1;
array ordgroupa{6} ordgroup1-ordgroup6;
array contgroupa{6} contg1-contg6;
array vargroupa{6} vargroup1-vargroup6;
array conta{6} cont1-cont6;
array vara{6} var1-var6;
do j = 1 to 6;
do i = 1 to 6;
if ordgroupa[j] = i then do;
contgroupa[j] = conta[i];
vargroupa[j] = vara[i];
end;
end;
end;

* append latest iteration results to all previous iterations results;
data means2;
set means2 means1;
run;

%end;
%mend;
run;

* turn off notes because macro generates a lot of information;
* remove this option for debugging program;
options nonotes;

* run simulation - note that it runs numiterations times because of do loop above;
%simulate;
run;

* calculate means of decomposition runs (averages over all iterations);
proc means data=means2;
title2 'Mean Values of Contribution Estimates from Simulations';
var yw ym contg1-contg6 vargroup1-vargroup6;
output out=meandecomp mean=;

* append the full sample means for ys and calculate percent contributions;
* dataset now has only one obs for the means;
data meandecomp;
merge meandecomp ywdata ymdata;
gap=ywfull-ymfull;
perc1=contg1/gap;
perc2=contg2/gap;
perc3=contg3/gap;
perc4=contg4/gap;
perc5=contg5/gap;
perc6=contg6/gap;
se1=sqrt(vargroup1);
se2=sqrt(vargroup2);
se3=sqrt(vargroup3);
se4=sqrt(vargroup4);
se5=sqrt(vargroup5);
se6=sqrt(vargroup6);
label contg1=&labelgroup1;
label contg2=&labelgroup2;
label contg3=&labelgroup3;
label contg4=&labelgroup4;
label contg5=&labelgroup5;
label contg6=&labelgroup6;
options label;

* format output for final decomposition table;
* outputs contribution estimates, gap percents and standard errors;
proc means data=meandecomp mean;
title2 'Final Output for Table - Means Values for Contribution Estimates';
var ywfull ymfull gap contg1 se1 perc1 contg2 se2 perc2 contg3 se3 perc3 contg4 se4 perc4 contg5 se5 perc5 contg6 se6 perc6;
run;