/**********************************************************
** Title: 01_Assemble_data_for_analysis.do 
** Date: March 15, 2007
** Authors: Sanford Gordon, Dimitri Landa, Gregory Huber
***********************************************************/

/*Specify path for 2000 NAES Source Data files Here (already in stata format). Don't include trailing slash in path.*/
/*The path is joined to each NAES file name with a backslash at the point of use, which is why no trailing slash belongs here.*/
local NAESPath="C:\Documents\Work\Published Papers\AJPS Campaign Advertising Effects\NAES\RawNAESDataInStataFormat"
/*Enables the pause command. No pause statement appears in the visible part of this file.*/
pause on

/*From this point on a command ends at a semicolon, not at the end of the line.*/
#delimit;
*do not halt output at more prompts;
set more off;
*start with nothing in memory;
clear;
*NOTE(review): memory allocation for older Stata releases, modern releases manage memory automatically - confirm whether still needed;
set mem 350m;
*close a log left open by an earlier run, capture suppresses the error when no log is open;
capture log close;
*relative path, so the logs directory must exist under the current working directory;
log using "logs\01_assemble_data_for_analysis.log", replace;

/**************
*Step 1. Combine 2000 challenges by sitting state legislators with district demographics and recoded challenger name recognition data
**************/

*State lookup 1: state legislators per U.S. House member, stored sorted by st for the later join;
insheet using "rawdata\StateLegislatorsPerHouseMember.csv";
sort st;
label variable legperhousemem "State legislators per Representative";
save "data\StateLegislatorsPerHouseMember.dta", replace;
clear;

*State lookup 2: legislative professionalism score (from King), stored sorted by st for the later join;
insheet using "rawdata\LegislativeProfessionalismFromKing.csv";
sort st;
label variable legprof "Legislative Professionalism (from King)";
save "data\LegislativeProfessionalism.dta", replace;
clear;

*Read the list of 2000 challenges by sitting state legislators;
insheet using "rawdata\challenges_by_sitting_state_legislators_2000.csv";

/**************************
Important data coding note:
***************************
E. Clay Shaw, Jr. (FL 22) was unchallenged in 1998, so dvp is missing for him in the Jacobson data.
He won in 1996 and in 1994 by similar margins. Based on 1996, dvp would be 38.1, which is the value we use.
*/

*Challenger party sign: -1 when deminc==0 (Democratic challenger), +1 in every other case;
gen chalparty=cond(deminc==0,-1,1);
label variable chalparty "Chal is Dem. (-1) or Rep. (1)";

*Vote-share variables read in with the challenger file;
label variable dpres "Democratic presidential candidate's vote share in prev. election";
label variable dvp "Democrat's vote share in prev. election";

*Labels for incumbent, challenger and state characteristics;
label variable fr "Incumbent is Freshman";
label variable deminc "Republican Challenger";
label variable termlaw "State has Term Limit";
label variable termed "Challenger Term Limited";

*Term-limit law signed by challenger party;
gen chalpartytermlaw=chalparty*termlaw;
label variable chalpartytermlaw "Party Adjusted State has Term Limit";

*Term-limited challenger split by party, deminc==0 marks a Democratic challenger and deminc==1 a Republican one;
gen demchaltermed=(deminc==0 & termed==1);
label variable demchaltermed "Democratic Challenger Term Limited";

gen gopchaltermed=(deminc==1 & termed==1);
label variable gopchaltermed "Republican Challenger Term Limited";

*Signed version: -1=Dem. challenger termed, 0=challenger not termed, 1=Rep. challenger termed;
gen istermed=chalparty*termed;
label variable istermed "Party Adjusted Termed Challenger";

*High opportunity cost: challenger resigned or did not run for the old office, plain and signed by challenger party;
rename alt_high_op chal_resign;
label variable chal_resign "Challenger resigned or didn't run for old office";
gen party_chal_resign=chalparty*chal_resign;
label variable party_chal_resign "Challenger resigned or didn't run for old office x Challenger Party";

*Assign party expenditures to roles: with deminc==1 the Democrat is the incumbent, with deminc==0 the Republican is;
gen incexp = dexp*deminc + rexp*(1-deminc);
label variable incexp "Incumbent's expenditures (FEC)";
gen chalexp = dexp*(1-deminc)+rexp*deminc;
label variable chalexp "Challenger's expenditures (FEC)";

*Numeric id for each state-district combination;
egen rdist = group(st district);
label variable rdist "District indicator";

*Freshman incumbents by incumbent party;
gen dem_fresh=(deminc==1 & fr==1);
label variable dem_fresh "Incumbent Democratic Freshman";
gen gop_fresh=(deminc==0 & fr==1);
label variable gop_fresh "Incumbent Republican Freshman";

*Challenger experience with a party sign;
*NOTE(review): the sign is negative for Republican challengers (deminc==1), the opposite of chalparty which is +1 for them - confirm the intended convention;
label variable currexp "Challenger's current experience";
gen challexperience=cond(deminc==1,-currexp,currexp);
label variable challexperience "Challenger Experience x Challenger Party";

*Attach state legislators per House member to each challenge;
*joinby keeps only observations whose st appears in both datasets unless an unmatched option is given, so compare the counts printed before and after;
count;
sort st;
joinby st using data\StateLegislatorsPerHouseMember.dta;
count;

*generate directional measure of state legislators per house member;
gen lphous_chalpart=legperhousemem*chalparty;
label var lphous_chalpart "State legislators per Representative x Challenger Party";

*Attach legislative professionalism the same way, again compare the two counts;
count;
sort st;
joinby st using data\LegislativeProfessionalism.dta;
count;

*generate directional measure of leg prof;
gen legprof_chalpart=legprof*chalparty;
label var legprof_chalpart "Legislative professionalism x Challenger Party";

*Save the race-level file that is joined to the survey respondents in Step 2;
save data\challenges_by_sitting_state_legislators.dta, replace;
clear;

/**************
* Step 2. Load NAES data, merge with data on races
**************/

*note: the path to the NAES source files is in the local macro NAESPath defined at the top of this file;
*Start with the panel and debate reinterview files. The goal is to keep respondents interviewed the first time outside our window but the second time within it;
*For them, the reinterview (r prefix) variables are recoded to take the place of the first interview (c prefix) variables;
use "`NAESPath'\elecpnl.dta";
qui append using "`NAESPath'\deb03oct.dta";
qui append using "`NAESPath'\deb11oct.dta";
qui append using "`NAESPath'\deb17oct.dta";
qui append using "`NAESPath'\dempnl.dta";
qui append using "`NAESPath'\goppnl.dta";

*get rid of panel folks interviewed first time in our window. those people are already in cross sectional datasets;
*a missing cdate also satisfies this condition, because missing compares greater than any number, and is dropped;
drop if cdate>=20001001;
*now get rid of people whose second interview was outside our window (the second line also drops a missing rdate);
drop if rdate<20001001;
drop if rdate>20001130;
*some folks are interviewed twice, take their pre-election interview;
*after sorting, a row is dropped when it is not the first row for its ckey and its rdate is after Nov 7 2000;
sort ckey rdate;
drop if ckey==ckey[_n-1] & rdate>20001107;
*This winnowing already eliminates all duplicates, but just to be sure, check;
*duplicates report only prints a tabulation and does not stop the run, so the result must be read in the log;
duplicates report ckey cdate;
*now recode all the r(second interview) variables to the first interview measures;
*each first-interview variable is dropped and the matching reinterview variable takes over its name;
drop cdate;
rename rdate cdate;
drop cv01 cv02 cv03;
rename rv01 cv01;
rename rv02 cv02;
rename rv03 cv03;
drop cu09 cu09_ cu10 cu12 cy06;
rename ru09 cu09;
rename ru09_ cu09_;
rename ru10 cu10;
rename ru12 cu12;
rename ry06 cy06;
*drop every remaining variable whose name starts with r;
drop r*;
save "data\recodedpanelrespondents.dta", replace;

clear;

*Stack the cross-sectional interview files;
use "`NAESPath'\cs000403.dta";
qui append using "`NAESPath'\cs000717.dta";
qui append using "`NAESPath'\cs000904.dta";
qui append using "`NAESPath'\cs001002.dta";
qui append using "`NAESPath'\cs001106.dta";
qui append using "`NAESPath'\cs010119.dta";
*get rid of people whose interview was outside our window (the second line also drops a missing cdate);
drop if cdate<20001001;
drop if cdate>20001130;
*add the recoded panel respondents saved earlier in this step;
qui append using "data\recodedpanelrespondents.dta";

compress;

*keep the variables positioned from cint through cst plus the listed question groups, the range depends on variable order in memory;
keep cint-cst ce* cf* ck* cr* cu* cv* cw* cy*;

*rename the survey geography variables to the key names used in the challenger file;
rename cu01 district;
rename cst st;

*attach race-level data. joinby keeps only respondents whose st and district match a race in the challenger file;
sort st district;
joinby st district using "data\challenges_by_sitting_state_legislators.dta";

/**************
* Step 3. Recode NAES variables
**************/

*print the description and distribution of the interviewer knowledge grade to the log;
codebook cy06;

/*

--------------------------------------------------------------------------------------------------------------------------------------------
cy06                                                                                    interviewer judge of respondents political knowledge
--------------------------------------------------------------------------------------------------------------------------------------------

                  type:  numeric (byte)
                 label:  cy06

                 range:  [1,5]                        units:  1
         unique values:  5                        missing .:  0/1309

            tabulation:  Freq.   Numeric  Label
                           273         1  a
                           526         2  b
                           383         3  c
                            93         4  d
                            34         5  f

*/

*Reverse the interviewer grade so that higher means more knowledgeable: a=3, b=2, c=1, and both d and f are 0;
gen knowledge=cond(cy06>=4 & !missing(cy06),0,4-cy06);
label variable knowledge "Political Knowledge";

/*

----------------------------------------------------------------------------------------------------------------
cv01                                                                                                    party id
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cv01

                 range:  [1,999]                      units:  1
         unique values:  6                        missing .:  0/51356

            tabulation:  Freq.   Numeric  Label
                         14243         1  republican
                         15874         2  democrat
                         14818         3  independent
                          4410         4  verbatim
                          1672       998  dont know
                           339       999  no answer

----------------------------------------------------------------------------------------------------------------
cv02                                                                                             party id strong
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cv02

                 range:  [1,999]                      units:  1
         unique values:  4                        missing .:  6421/51356

            tabulation:  Freq.   Numeric  Label
                         23598         1  strong
                         20394         2  not very strong
                           805       998  dont know
                           138       999  no answer
                          6421         .  

----------------------------------------------------------------------------------------------------------------
cv03                                                                               lean republican or democratic
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cv03

                 range:  [1,999]                      units:  1
         unique values:  5                        missing .:  30117/51356

            tabulation:  Freq.   Numeric  Label
                          6665         1  republican
                          7427         2  democratic
                          5553         3  neither
                          1327       998  dont know
                           267       999  no answer
                         30117         .  

*/

*pid5 is a 5-point party identification scale: +2=strong Republican, +1=weak or leaning Republican, 0=independent;
*the Democratic side mirrors this with -1 and -2;
*Coding steps as written below:;
*  cv01 republican becomes 2, democrat becomes -2, independent, verbatim and dont know become 0, no answer becomes missing;
*  partisans whose cv02 is anything other than strong (including dont know, no answer and missing) move to +1 or -1;
*  anyone who leans on cv03 is set to +1 or -1, which also fills in respondents left missing by a cv01 no answer;
*NOTE(review): an earlier comment said dont know or no answer on both cv01 and cv03 is missing, but as coded only a cv01 no answer without a lean stays missing, a cv01 dont know without a lean is 0 - confirm which is intended;

gen pid5=cv01;
recode pid5 (1=2) (2=-2) (3=0) (4=0) (998=0) (999=.);
replace pid5=-1 if pid5==-2 & cv02~=1;
replace pid5=1 if pid5==2 & cv02~=1;
replace pid5=-1 if cv03==2;
replace pid5=1 if cv03==1;

label variable pid5 "Partisan Identification (5 Point Scale, -2 strong Dem, +2 strong GOP)";

*sign pid5 by challenger party: chalparty is +1 for a Republican challenger and -1 for a Democratic one;
*so positive values mean the respondent leans toward the challenger's party (the variable name says incumb, the label says challenger);
gen incumb_pid5 = chalparty*(pid5);
label variable incumb_pid5 "Partisan Affinity with Challenger";

*strength of partisanship regardless of direction;
gen abspid=abs(pid5);
label var abspid "Absolute value of Partisan Identification";

/*

----------------------------------------------------------------------------------------------------------------
cw01                                                                                                         sex
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (byte)
                 label:  cw01

                 range:  [1,2]                        units:  1
         unique values:  2                        missing .:  0/51356

            tabulation:  Freq.   Numeric  Label
                         22955         1  male
                         28401         2  female

----------------------------------------------------------------------------------------------------------------
cw02                                                                                                         age
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cw02, but 80 nonmissing values are not labeled

                 range:  [18,999]                     units:  1
         unique values:  82                       missing .:  0/51356

              examples:  30    
                         40    
                         49    
                         61    

*/

*cw01 is 1=male and 2=female, so subtracting one gives a 0/1 female indicator;
gen female=cw01-1;
label variable female "Female";

*age in years, codes of 900 and above are dk/no answer and are left missing;
gen age=cw02 if cw02<900;
label variable age "Age in years";

gen age2=age^2;
label variable age2 "Age squared";

/*

----------------------------------------------------------------------------------------------------------------
cw03                                                                                                        race
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cw03

                 range:  [1,999]                      units:  1
         unique values:  6                        missing .:  0/51356

            tabulation:  Freq.   Numeric  Label
                         42173         1  white
                          4511         2  black
                           853         3  asian
                          3052         4  verbatim
                           247       998  dont know
                           520       999  no answer

----------------------------------------------------------------------------------------------------------------
cw04                                                                               of hispanic or spanish origin
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cw04

                 range:  [1,999]                      units:  1
         unique values:  4                        missing .:  0/51356

            tabulation:  Freq.   Numeric  Label
                          4048         1  yes
                         46960         2  no
                           100       998  dont know
                           248       999  no answer

*/

*white: 1 for cw03 white, 0 for black, asian or verbatim, missing for every other code;
gen white=1 if cw03==1;
replace white=0 if inrange(cw03,2,4);
label variable white "White";

*hispanic: 1 for cw04 yes, 0 otherwise, missing when cw04 is above 900 (dont know, no answer) or itself missing;
gen hispanic=(cw04==1) if cw04<=900;
label variable hispanic "Hispanic";

/*

----------------------------------------------------------------------------------------------------------------
cw06                                                                                                   education
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cw06

                 range:  [1,999]                      units:  1
         unique values:  11                       missing .:  0/51356


                              education |      Freq.     Percent        Cum.
----------------------------------------+-----------------------------------
                   grade eight or lower |      1,455        2.83        2.83
           some high school, no diploma |      3,273        6.37        9.21
      high school diploma or equivalent |     14,597       28.42       37.63
technical or vocational school after hi |      1,404        2.73       40.36
                some college, no degree |      8,914       17.36       57.72
  associates or two-year college degree |      4,503        8.77       66.49
               four-year college degree |      9,152       17.82       84.31
graduate or professional school after c |      1,518        2.96       87.27
        graduate or professional degree |      6,247       12.16       99.43
                              dont know |         65        0.13       99.56
                              no answer |        228        0.44      100.00
----------------------------------------+-----------------------------------
                                  Total |     51,356      100.00

----------------------------------------------------------------------------------------------------------------
cw07                                                                                                  us citizen
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cw07

                 range:  [1,999]                      units:  1
         unique values:  4                        missing .:  0/51356

            tabulation:  Freq.   Numeric  Label
                         49542         1  yes
                          1706         2  no
                            11       998  dont know
                            97       999  no answer
*/

*education collapsed from the nine cw06 categories to five levels, dont know and no answer become missing;
gen education=cw06;
recode education (1 2=1) (3=2) (4 5 6=3) (7 8=4) (9=5) (998/999=.);
label variable education "Education";

*alteducation keeps the original cw06 categories and only blanks out dont know and no answer;
gen alteducation=cw06 if !inrange(cw06,998,999);
label variable alteducation "Education";

*us_citizen: 1=yes, 0=no, missing for dont know and no answer;
gen us_citizen=cw07;
recode us_citizen (2=0) (998/999=.);

/*

----------------------------------------------------------------------------------------------------------------
cw14                                                                                   attend religious services
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cw14

                 range:  [1,999]                      units:  1
         unique values:  7                        missing .:  0/51356

            tabulation:  Freq.   Numeric  Label
                          6615         1  more than once a week
                         14250         2  once a week
                          7982         3  once or twice a month
                         13838         4  few times a year
                          8242         5  never
                           107       998  dont know
                           322       999  no answer

*/


*reverse cw14 so that 4=more than once a week down to 0=never, every code other than 1 to 5 is left missing;
gen religious_attend=5-cw14 if inlist(cw14,1,2,3,4,5);
label variable religious_attend "Religious services attendance";

/*

----------------------------------------------------------------------------------------------------------------
cw28                                                                                            household income
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cw28

                 range:  [1,999]                      units:  1
         unique values:  11                       missing .:  0/51356

              household income |      Freq.     Percent        Cum.
-------------------------------+-----------------------------------
             less than $10,000 |      3,095        6.03        6.03
  $10,000 to less than $15,000 |      3,103        6.04       12.07
  $15,000 to less than $25,000 |      5,795       11.28       23.35
  $25,000 to less than $35,000 |      6,795       13.23       36.58
  $35,000 to less than $50,000 |      8,649       16.84       53.43
  $50,000 to less than $75,000 |      8,723       16.99       70.41
 $75,000 to less than $100,000 |      4,828        9.40       79.81
$100,000 to less than $150,000 |      3,055        5.95       85.76
              $150,000 or more |      1,863        3.63       89.39
                     dont know |      2,115        4.12       93.51
                     no answer |      3,335        6.49      100.00
-------------------------------+-----------------------------------
                         Total |     51,356      100.00

----------------------------------------------------------------------------------------------------------------
cw29                                                                                       labor union household
----------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cw29

                 range:  [1,999]                      units:  1
         unique values:  4                        missing .:  0/51356

            tabulation:  Freq.   Numeric  Label
                          8129         1  yes
                         42809         2  no
                           311       998  dont know
                           107       999  no answer

*/

*income keeps the cw28 category code, with dont know and no answer set to 0 and flagged by income_dkna;
gen income=cond(inrange(cw28,998,999),0,cw28);
label variable income "Income";

gen income_dkna=inlist(cw28,998,999);
label variable income_dkna "Income Refused or Don't Know";

*union household: 1=yes, no and dont know are both 0, no answer is missing;
gen union=cw29;
recode union (2 998=0) (999=.);
label variable union "Union household";

/*

-------------------------------------------------------------------------------------------------------------
cu10                                                  favorability: incumbent candidate for us representative
-------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cu10, but 40 nonmissing values are not labeled

                 range:  [0,999]                      units:  1
         unique values:  43                       missing .:  1949/3526

              examples:  70    
                         102   cannot rate
                         .     
                         .     

101 is do not recog, 102 is cannot rate

-------------------------------------------------------------------------------------------------------------
cu12                         favorability: first major general election challenging candidate for us represen
-------------------------------------------------------------------------------------------------------------

                  type:  numeric (int)
                 label:  cu12, but 33 nonmissing values are not labeled

                 range:  [0,999]                      units:  1
         unique values:  36                       missing .:  2654/3526

              examples:  102   cannot rate
                         .     
                         .     
                         .     
*/

/*
Candidate favorability (cu10 = incumbent, cu12 = challenger).
Per the codebook notes above, 101 = did not recognize and 102 = cannot rate, and the stored range
runs up to 999. Every code above 100 is therefore a nonresponse code rather than a rating, so all
of them are set to missing here. Listing only 101, 102 and 999 would let any other such code
(for example 998) survive as a favorability score.
*/
gen inc_favor=cu10;
recode inc_favor 101/max=.;
label variable inc_favor "Favorability of incumbent";

gen chal_favor=cu12;
recode chal_favor 101/max=.;
label variable chal_favor "Favorability of challenger";

*Reassign the ratings by party: with deminc==1 the Democrat is the incumbent, with deminc==0 the challenger;
gen demfav = .;
replace demfav = inc_favor if deminc == 1;
replace demfav = chal_favor if deminc == 0;
label var demfav "Democratic candidate's favorability";

gen gopfav = .;
replace gopfav = chal_favor if deminc == 1;
replace gopfav = inc_favor if deminc == 0;
label var gopfav "Republican candidate's favorability";

gen relativedemfav = demfav-gopfav;
label var relativedemfav "Democrat's favorability - Republican's favorability";

/*
Alternative measure: when exactly one candidate is unrated, that rating is imputed as 50.
The difference is always Democrat minus Republican, so a missing Democratic rating gives
50-gopfav. The earlier gopfav-50 reversed the sign for those respondents.
When both ratings are missing the measure stays missing.
*/
gen alt_relativedemfav=relativedemfav;
replace alt_relativedemfav=demfav-50 if missing(gopfav) & !missing(demfav);
replace alt_relativedemfav=50-gopfav if missing(demfav) & !missing(gopfav);
label var alt_relativedemfav "Democrat's favorability - Republican's favorability, missing rating=50";

*we can also code it according to whether the person said did not recog. or refused to rate;
*1 = gave a 0 to 100 rating, 0 = did not recognize (101) or cannot rate (102), every other code and missing stay missing;
gen recog_chal=cu12;
recode recog_chal (0/100=1) (101 102=0) (*=.);
label variable recog_chal "Rated Challenger (1=yes, 0=Didn't recognize/Can't rate)";

*offeredname copies cu09 and sets code 998 to 0;
*NOTE(review): the label says 1=yes but only 998 is recoded, so any other cu09 code passes through unchanged. The cu09 codebook is not shown here - confirm the remaining codes;
gen offeredname=cu09;
replace offeredname=0 if cu09==998;
label var offeredname "Offered to Name Candidates (1=yes)";

*cdate holds the interview date as a yyyymmdd number. Convert it to text, then build a Stata date from the month, day and year pieces;
gen str8 strdate = string(cdate,"%9.0f");
gen interviewdate = mdy(real(substr(strdate,5,2)),real(substr(strdate,7,2)),real(substr(strdate,1,4)));

/*This is a district group indicator, for clustering standard errors */
egen groupid = group(st district);

/**************
* Step 4. Recode variables incorporating information on incumbent status
**************/

*Logged expenditures: 1 is added to the amount before dividing by 1000 and taking the log;
gen lnchalexp=ln((1+chalexp)/1000);
label variable lnchalexp "Ln(Challenger's Expenditures)";
gen lnincexp=ln((1+incexp)/1000);
label variable lnincexp "Ln(Incumbent's Expenditures)";

*Democratic funding advantage in logs, split by which party holds the seat. Each variable is 0 in districts of the other type;
gen ln_dem_fundadv_deminc=deminc* ( ln((1+dexp)/1000) - ln((1+rexp)/1000) );
label variable ln_dem_fundadv_deminc "Log(Dem. Expenditures) - Log(Rep. Expenditures), Dem. Inc.";
gen ln_dem_fundadv_repinc=(1-deminc)* ( ln((1+dexp)/1000) - ln((1+rexp)/1000) );
label variable ln_dem_fundadv_repinc "Log(Dem. Expenditures) - Log(Rep. Expenditures), Rep. Inc.";

gen lnchalexp_chalpart=chalparty*lnchalexp;
label variable lnchalexp_chalpart "Ln(Challenger's Expenditures) x Challenger Party";

*Incumbent's prior House vote share: dvp is the Democrat's share, so it is flipped to 100-dvp when the challenger is the Democrat;
gen incumbent_priorvs=cond(chalparty==-1,100-dvp,dvp);
label variable incumbent_priorvs "Incumbent's prior vote share";

gen incvs_chalpart=chalparty*incumbent_priorvs;
label variable incvs_chalpart "Incumbent's prior vote share x Challenger Party";

*Same construction for the presidential vote share of the incumbent's party, starting from the Democratic share dpres;
gen incumbent_priorpresvs=cond(chalparty==-1,100-dpres,dpres);
label variable incumbent_priorpresvs "Incumbent's presidential candidate's prior vote share";

gen incpvs_chalpart=chalparty*incumbent_priorpresvs;
label variable incpvs_chalpart "Incumbent's presidential candidate's prior vote share x Challenger Party";

/****************
*Additional recoding
****************/

*Female challenger indicator, set by hand for CA-49, FL-15, FL-22, ME-1, KY-3, NE-2, NH-1 and NM-3;
generate femcand = (st=="CA" & district==49)
	| (st=="FL" & inlist(district,15,22))
	| (st=="ME" & district==1)
	| (st=="KY" & district==3)
	| (st=="NE" & district==2)
	| (st=="NH" & district==1)
	| (st=="NM" & district==3);
label variable femcand "Female Challenger";

*Signed version: +1 for a female challenger when deminc==0, -1 when deminc==1, 0 when the challenger is not female;
*NOTE(review): this sign is the opposite of chalparty, which is +1 when deminc==1 - confirm the intended convention;
gen femparty=femcand*(1-deminc) - femcand*deminc;
label variable femparty "Female Challenger x Challenger Party";

*Days elapsed since Oct 1 2000, held at the Nov 7 2000 value for every interview after that date;
*a missing interviewdate compares greater than any date, so it receives the capped value as well;
gen timetrend=interviewdate-mdy(10,1,2000);
label variable timetrend "Time trend";
replace timetrend=mdy(11,7,2000)-mdy(10,1,2000) if interviewdate>mdy(11,7,2000);

*Indicator variables for each knowledge level, the first one created (lowest level) is dropped as the reference;
quietly tabulate knowledge, generate(knn_);
drop knn_1;

label variable knn_2 "Political Knowledge=1";
label variable knn_3 "Political Knowledge=2";
label variable knn_4 "Political Knowledge=3";

*Term-limited challenger interacted with each knowledge level: equals termed at that level and 0 at any other;
gen termed_knn1=cond(knowledge==0,termed,0);
gen termed_knn2=cond(knowledge==1,termed,0);
gen termed_knn3=cond(knowledge==2,termed,0);
gen termed_knn4=cond(knowledge==3,termed,0);

label variable termed_knn1 "Challenger Term Limited x Political Knowledge=0";
label variable termed_knn2 "Challenger Term Limited x Political Knowledge=1";
label variable termed_knn3 "Challenger Term Limited x Political Knowledge=2";
label variable termed_knn4 "Challenger Term Limited x Political Knowledge=3";

/****************
*Restrict sample
****************/

*keep only interviews between Oct 1 and Nov 30 2000;
*a missing interviewdate fails the upper bound, so window is 0 and the row is dropped;
gen window = interviewdate>=mdy(10,1,2000) & interviewdate <=mdy(11,30,2000);
keep if window == 1;

*keep only US citizens;
*respondents whose us_citizen is 0 or missing (dont know, no answer) are dropped;
keep if us_citizen==1;

*drop raw NAES items that are no longer needed;
*these are positional ranges, so the result depends on the variable order in memory;
*NOTE(review): presumably this leaves cu09, cu09_, cu10 and cu12 in the file - verify against the variable order;
drop  ce01- crz07  cu02- cu08_ cu11 cu13- cy06;

compress;

*final analysis dataset;
save data\NAES_ChallengerData_Reduced.dta, replace;

log close;
