diff --git a/README.md b/README.md index e08003d..5760e17 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ The help file can be explored interactively in Stata using `help calipmatch`.
calipmatch [if] [in], generate(newvar) casevar(varname) maxmatches(#) calipermatch(varlist) caliperwidth(numlist) [exactmatch( - varlist)] + varlist) nostandardize]
options Description @@ -44,6 +44,8 @@ The help file can be explored interactively in Stata using `help calipmatch`.
Optional exactmatch(varlist) list of integer variables to match on exactly + nostandardize use the unstandardized sum of squared differences; + default is the standardized sum -------------------------------------------------------------------------
@@ -66,11 +68,11 @@ The help file can be explored interactively in Stata using `help calipmatch`.
The cases are processed in random order. For each case, calipmatch searches for matching controls. If any valid matches exist, it selects - the matching control which minimizes the sum of squared differences - across caliper matching variables. If maxmatches(#)>1, then after - completing the search for a first matching control observation for each - case, the algorithm will search for a second matching control observation - for each case, etc. + the matching control which minimizes the standardized sum of squared + differences across caliper matching variables (unstandardized when + nostandardize is specified). If maxmatches(#)>1, then after completing + the search for a first matching control observation for each case, the + algorithm will search for a second matching control for each case, etc.
Options @@ -119,6 +121,10 @@ The help file can be explored interactively in Stata using `help calipmatch`. This enables speedy exact matching, by ensuring that all values are stored as precise integers.
+ nostandardize calculates distance between cases and controls using the sum + of squared differences of the raw, unstandardized caliper variables. When + specified, matches are sensitive to the scale of the caliper variables, so + rescaling a variable (and its caliper width) changes its weight in matching.
diff --git a/calipmatch.ado b/calipmatch.ado index 39351f6..41ac156 100644 --- a/calipmatch.ado +++ b/calipmatch.ado @@ -12,7 +12,7 @@ human-readable summary can be accessed at http://creativecommons.org/publicdomai program define calipmatch, sortpreserve rclass version 13.0 - syntax [if] [in], GENerate(name) CASEvar(varname numeric) MAXmatches(numlist integer >0 max=1) CALIPERMatch(varlist numeric) CALIPERWidth(numlist >0) [EXACTmatch(varlist)] + syntax [if] [in], GENerate(name) CASEvar(varname numeric) MAXmatches(numlist integer >0 max=1) CALIPERMatch(varlist numeric) CALIPERWidth(numlist >0) [EXACTmatch(varlist) nostandardize] * Verify there are same number of caliper vars as caliper widths if (`: word count `calipermatch'' != `: word count `caliperwidth'') { @@ -88,9 +88,29 @@ program define calipmatch, sortpreserve rclass tempname case_matches if r(no_matches)==0 { - mata: _calipmatch(boundaries,"`generate'",`maxmatches',"`calipermatch'","`caliperwidth'") + + if "`standardize'"=="" { + * Create standardized caliper vars (subtract mean, divide by SD) + local i = 0 + foreach var of varlist `calipermatch' { + local ++i + + tempvar std_`var' + qui sum `var' in `=_N-`insample_total'+1'/`=_N' + qui gen `std_`var'' = (`var' - r(mean)) / r(sd) in `=_N-`insample_total'+1'/`=_N' + + local std_calipermatch `std_calipermatch' `std_`var'' + local std_caliperwidth `std_caliperwidth' `=`: word `i' of `caliperwidth'' / r(sd)' + } + + mata: _calipmatch(boundaries,"`generate'",`maxmatches',"`std_calipermatch'","`std_caliperwidth'") + } + else { + mata: _calipmatch(boundaries,"`generate'",`maxmatches',"`calipermatch'","`caliperwidth'") + } + qui compress `generate' - + matrix `case_matches' = r(matchsuccess) matrix `case_matches' = (`cases_total' - `case_matches''* J(rowsof(`case_matches'),1,1)) \ `case_matches' } @@ -150,19 +170,19 @@ void _calipmatch(real matrix boundaries, string scalar genvar, real scalar maxma // Outputs: // The values of "genvar" are filled with integers 
that describe each group of matched cases and controls. // - r(matchsuccess) is a Stata return matrix tabulating the number of cases successfully matched to {1, ..., maxmatch} controls - + real scalar matchgrp matchgrp = st_varindex(genvar) real rowvector matchvars matchvars = st_varindex(tokens(calipvars)) - + real rowvector tolerance tolerance = strtoreal(tokens(calipwidth)) real scalar curmatch curmatch = 0 - + real scalar highestmatch highestmatch = 0 @@ -239,7 +259,7 @@ void _calipmatch(real matrix boundaries, string scalar genvar, real scalar maxma stata("return clear") st_matrix("r(matchsuccess)",matchsuccess) - + } real matrix find_group_boundaries(string scalar grpvars, string scalar casevar, real scalar startobs, real scalar endobs) { diff --git a/calipmatch.sthlp b/calipmatch.sthlp index 7274064..b9d2980 100644 --- a/calipmatch.sthlp +++ b/calipmatch.sthlp @@ -24,7 +24,7 @@ Create a variable indicating groups of matched cases and controls {opt max:matches(#)} {opth caliperm:atch(varlist)} {opth caliperw:idth(numlist)} -[{opth exactm:atch(varlist)}] +[{opth exactm:atch(varlist)} {bf: nostandardize}] {synoptset 23 tabbed}{...} @@ -44,6 +44,7 @@ matching{p_end} {syntab :Optional} {synopt :{opth exactm:atch(varlist)}}list of integer variables to match on exactly{p_end} +{synopt :{bf: nostandardize}} distance using sum of squares; default is standardized sum of squares {p_end} {synoptline} @@ -67,7 +68,7 @@ variables when multiple valid matches exist. {pstd} The cases are processed in random order. For each case, {cmd:calipmatch} searches for matching controls. If -any valid matches exist, it selects the matching control which minimizes the sum of squared differences across +any valid matches exist, it selects the matching control which minimizes the standardized sum of squared differences across caliper matching variables. 
If {opt maxmatches(#)}>1, then after completing the search for a first matching control observation for each case, the algorithm will search for a second matching control observation for each case, etc. @@ -115,6 +116,8 @@ matching variables, they must also have identical values for every exact matchin {it:int} or {it:long}. This enables speedy exact matching, by ensuring that all values are stored as precise integers. +{phang}{bf: nostandardize} calculates distance between cases and controls using the sum of squared differences. +When specified, matches will be sensitive to the scale of caliper variables. This can be used to weight caliper variables. {marker saved_results}{...} {title:Saved results} diff --git a/test_calipmatch.do b/test_calipmatch.do index 5151395..84a8494 100644 --- a/test_calipmatch.do +++ b/test_calipmatch.do @@ -9,7 +9,7 @@ program define test_calipmatch if (_rc==0) { * Assign arguments to locals using the same syntax as calipmatch - syntax [if] [in], GENerate(name) CASEvar(varname numeric) MAXmatches(numlist integer >0 max=1) CALIPERMatch(varlist numeric) CALIPERWidth(numlist >0) [EXACTmatch(varlist)] + syntax [if] [in], GENerate(name) CASEvar(varname numeric) MAXmatches(numlist integer >0 max=1) CALIPERMatch(varlist numeric) CALIPERWidth(numlist >0) [EXACTmatch(varlist) nostandardize] * Store returned objects local cases_total = r(cases_total) @@ -345,30 +345,107 @@ replace income_percentile = 52 in 3 replace income_percentile = 41 in 4 replace income_percentile = 55 in 5 -gen byte age = 40 -replace age = 47 in 2 -replace age = 55 in 4 +gen int age_days = 14600 +replace age_days = 17155 in 2 +replace age_days = 20075 in 4 + +*---------------------------------------------------------------------------- +* Valid inputs, test performance of matching algorithm +*---------------------------------------------------------------------------- -gen float sse = (income_percentile - income_percentile[1])^2 + (age - age[1])^2 +* matches minimize sum of 
normalized squares +egen std_income_percentile = std(income_percentile) +egen std_age_days = std(age_days) +gen float std_sse = (std_income_percentile - std_income_percentile[1])^2 + (std_age_days - std_age_days[1])^2 list +test_calipmatch, gen(matchgroup) case(case) maxmatches(1) /// + calipermatch(income_percentile age_days) caliperwidth(100 36500) + +sum std_sse if case==0, meanonly +assert cond(_n==2, std_sse==r(min), std_sse!=r(min)) // test that obs 2 is global min + +assert matchgroup == 1 in 2 // test that obs 2 is matched +assert matchgroup == . in 3/5 + +keep case income_percentile age_days + +* matches minimize sum of squares when nostandardize is specified +gen float sse = (income_percentile - income_percentile[1])^2 + (age_days - age_days[1])^2 +list + +test_calipmatch, gen(matchgroup) case(case) maxmatches(1) /// + calipermatch(income_percentile age_days) caliperwidth(100 36500) nostandardize + +sum sse if case==0, meanonly +assert cond(_n==3, sse==r(min), sse!=r(min)) // test that obs 3 is global min + +assert matchgroup == 1 in 3 // test that obs 3 is matched +assert matchgroup == . in 2 +assert matchgroup == . 
in 4/5 + +keep case income_percentile age_days + +*============================================================================ +* New dataset: two caliper matching variables, with scaling and a shift +*============================================================================ + +clear +set obs 2000 + +gen byte case=(_n<=200) + +gen byte income_percentile=ceil(runiform() * 100) +gen byte age = 44 + ceil(runiform()*17) +gen int days_over_44 = (age - 44)*365 + *---------------------------------------------------------------------------- * Valid inputs, test performance of matching algorithm *---------------------------------------------------------------------------- -* matches minimize sum of squares -test_calipmatch, gen(matchgroup) case(case) maxmatches(1) /// - calipermatch(income_percentile age) caliperwidth(100 100) +* matches are scale and shift invariant +set seed 4585239 +set sortseed 789045789 -sum sse if case==0, meanonly -assert cond(_n==2, sse==r(min), sse!=r(min)) // test that obs 2 is global min +test_calipmatch, gen(matchgroup_1) case(case) maxmatches(1) /// + calipermatch(income_percentile age) caliperwidth(5 3) -assert matchgroup == 1 in 2 // test that obs 2 is matched -assert matchgroup == . 
in 3/5 +drop casecount matched_case control matched_controls + +set seed 4585239 +set sortseed 789045789 + +test_calipmatch, gen(matchgroup_2) case(case) maxmatches(1) /// + calipermatch(income_percentile days_over_44) caliperwidth(5 1095) + +drop casecount matched_case control matched_controls + +gen match_diffs_std = abs(matchgroup_1 - matchgroup_2) +su match_diffs_std, meanonly +assert r(max) == 0 + +* matches are scale and shift dependent when nostandardize is specified +set seed 4585239 +set sortseed 789045789 + +test_calipmatch, gen(matchgroup_3) case(case) maxmatches(1) /// + calipermatch(income_percentile age) caliperwidth(5 3) nostandardize + +drop casecount matched_case control matched_controls + +set seed 4585239 +set sortseed 789045789 + +test_calipmatch, gen(matchgroup_4) case(case) maxmatches(1) /// + calipermatch(income_percentile days_over_44) caliperwidth(5 1095) nostandardize + +gen match_diffs = abs(matchgroup_3 - matchgroup_4) +su match_diffs, meanonly +assert r(max) != 0 -keep case income_percentile age +keep case income_percentile age *---------------------------------------------------------------------------- -di "Successfully completed all tests." +di "Successfully completed all tests." \ No newline at end of file