/*
In contrast to the dataset provided for ÜGK17, the scientific use file for ÜGK16 
available at FORSbase (version 1.0.0, 10.23662/FORS-DS-1004-1) does not include 
a predefined variable for the socio-economic status (SES) of the students. Also,
it does not include multiply imputed data that address missing values. Because 
one of the components of SES (i.e., parents' highest educational attainment) 
has a substantial number of missing values, the problem of missing values should 
be addressed before generating the variable ses.

This script applies the procedure used for PISA to impute the three 
components used in the definition of SES and defines the variables ses and qses 
as defined in the technical appendices to the ÜGK report.

Note: The resulting variables will not be identical to the ones used for the national report.

Imputation follows: 
PISA 2015 Technical Report, P. 339:
http://www.oecd.org/pisa/data/2015-technical-report/PISA2015_TechRep_Final-Chapter16.pdf
"For students with missing data on one out of the three components [for ÜGK: 
highest education of parents, highest ISEI of parents, number of books at home] 
the missing variable was imputed. Regression on the other two variables was used 
to predict the third (missing) variable, and a random component was added to the
predicted value. If there were missing data on more than one component, ESCS 
[for ÜGK: SES] was not computed and a missing value was assigned for ESCS [for 
ÜGK: SES]."

Recoding of components and definition of SES follows:
Pham, G., Helbling, L., Verner, M., Petrucci, F., Angelone, D., 
& Ambrosetti, A. (2019). ÜGK – COFO – VeCoF 2016 results: Technical appendices. 
St.Gallen & Genf: Pädagogische Hochschule St.Gallen (PHSG) 
& Service de la recherche en éducation (SRED), pp. 4-7.
http://uegk-schweiz.ch/wp-content/uploads/2019/06/UEGK16__Technical-appendices.pdf


Author: Simon Seiler, ICER, University of Bern
Version 1.2; 2021, February 24

Changelog:
* 1.2: hisei08 imputation using truncreg instead of regress
*/

version 16 
// -> should work with older versions of Stata, except for the direct import
//    of SPSS data (import spss)

clear all


cd "H:\Documents\uegk 2016 Daten def forsbase"

// Import SPSS-Data
import spss using "1004_UGK16_Data_E_v1.0.0.sav", clear

// According to Pham et al. 2019, "other" educational attainments have
// been treated as a missing code when combining mother's and father's 
// educational attainment into the variable "fmedu". Evidently, this was not
// the case, as fmedu is "other" if medu or fedu is missing and the other one
// is "other". In order to avoid an inconsistent definition of fmedu (and,
// subsequently, ses) we set fmedu to missing here.
mvdecode fmedu, mv(8 = .)

// Number of missing SES components per student (0-3).
// This must be computed AFTER fmedu == 8 ("other") has been set to missing:
// otherwise a student with fmedu "other" plus one further missing component
// would be counted as having only one missing component, would pass the
// "sesmissing<2" restriction of the imputation, and would get two components
// imputed -- contrary to the PISA rule that SES is left missing when more
// than one component is missing.
egen byte sesmissing = rowmiss(hisei08 fmedu books)


// recode books to books5, according to Pham et al. 2019
recode books 	(1 2 = 0 "0-10 books") ///
				(3   = 1 "11-50 books") ///
				(4   = 2 "51-100 books") ///
				(5   = 3 "101-250 books") ///
				(6 7 = 4 "more than 250 books"), gen(books5)

// Impute the three SES components, following the PISA 2015 Technical Report.
// The data are mi set in wide style; the single imputation (add(1)) is then
// available as _1_<varname> until the data are unset again.
mi set wide
mi register imputed hisei08 fmedu books5

// sort and seed are set immediately before imputing
// (presumably to make the imputed values reproducible)
sort id_student
set seed 89564

// - hisei08:       truncated regression; limits taken from
//                  harryganzeboom.nl/ISCO08/index.htm
// - fmedu, books5: ordered logit
// Restricted to students with fewer than two missing components,
// weighted with smp_w_nrastubw, run separately by id_canton.
mi impute chained                              ///
	(truncreg, ll(11.01) ul(88.96)) hisei08    ///
	(ologit) fmedu books5                      ///
		if sesmissing<2 [pw=smp_w_nrastubw],   ///
		add(1) by(id_canton)

// imp_<component>: observed value where available, otherwise the imputed one
foreach comp of varlist hisei08 fmedu books5 {
	clonevar imp_`comp' = `comp'
	replace  imp_`comp' = _1_`comp' if missing(`comp') & !missing(_1_`comp')
}

mi unset


// Flag, for each component, whether the value in imp_<component> is
// observed (0), imputed (1), or still missing (2).
label define impflag 0 "not missing", modify
label define impflag 1 "missing, imputed", modify
label define impflag 2 "missing, not imputed", modify

foreach comp in hisei08 fmedu books5 {
	generate byte impflag_`comp' = ///
		cond(!missing(`comp'), 0, cond(!missing(imp_`comp'), 1, 2))
	label values impflag_`comp' impflag
}


// z-standardize each (imputed) component using the weighted mean and sd
foreach comp in hisei08 fmedu books5 {
	summarize imp_`comp' [aw=smp_w_nrastubw]
	generate zimp_`comp' = (imp_`comp' - `r(mean)')/`r(sd)'
}

// ses
// - mean of the three standardized components; only computed if all three
//   are available
egen ses1 = rowmean(zimp_hisei08 zimp_fmedu zimp_books5) ///
	if zimp_hisei08 < . & zimp_fmedu < . & zimp_books5 < .
// - z-standardize the mean score (again weighted)
summarize ses1 [aw=smp_w_nrastubw]
generate ses = (ses1 - `r(mean)')/`r(sd)'


// weighted quartiles of ses
xtile qses = ses [pw=smp_w_nrastubw], nquantiles(4)
label define qses 1 "SES: 1st quartile", modify
label define qses 2 "SES: 2nd quartile", modify
label define qses 3 "SES: 3rd quartile", modify
label define qses 4 "SES: 4th quartile", modify
label values qses qses

// flag describing the information ses/qses is based on
label define impflagses                                                      ///
	0 "SES based on complete information (no missings in components)"        ///
	1 "SES based on imputed information (one component missing but imputed)" ///
	2 "SES missing (more than one component missing, not imputed)"

// start from "SES missing" and overwrite where the component flags say
// otherwise; the two conditions below are mutually exclusive
generate byte impflag_ses = 2
replace impflag_ses = 1 ///
	if inlist(1, impflag_hisei08, impflag_fmedu, impflag_books5)
replace impflag_ses = 0 ///
	if impflag_hisei08==0 & impflag_fmedu==0 & impflag_books5==0
label values impflag_ses impflagses


// variable labels
// (books5 must be labelled first: the label of imp_books5 is derived from it)
label variable books5 "`: variable label books', recoded"

foreach comp in hisei08 fmedu books5 {
	label variable imp_`comp' "`: variable label `comp'', imputed"
}
foreach comp in hisei08 fmedu books5 {
	label variable impflag_`comp' "flags imputed values in imp_`comp'"
}
label variable impflag_ses "flags imputed values in ses/qses"

label variable ses  "socioeconomic status (SES)"
label variable qses "socioeconomic status (SES), quartiles"

// tidy-up: remove helper variables and what mi unset left behind
// NOTE(review): *_1_ and mi_miss presumably are the names the mi variables
// carry after "mi unset" -- confirm against the mi unset documentation
drop sesmissing *_1_ mi_miss zimp_* ses1

// move the new variables to the end of the dataset
order books5                      ///
      imp_hisei08 impflag_hisei08 ///
      imp_fmedu   impflag_fmedu   ///
      imp_books5  impflag_books5  ///
      ses qses    impflag_ses, last