Thanks to Kit Baum, a new package, gtools, is now available for download from SSC. In Stata 13.1 or later, install it by running "ssc install gtools".
Gtools implements faster versions of several Stata commands:
The syntax of each command is largely analogous to that of its native counterpart, and several additions are available. See "gtools, examples" for some quick examples and "help gtools" for details and additional features. The package's official website, which includes more detailed examples, benchmarks, and documentation, is https://gtools.readthedocs.io; development currently takes place on GitHub (https://github.com/mcaceresb/stata-gtools).
Feedback and comments are welcome! Here are some quick benchmarks to showcase the speed gains (Stata 15/MP):
And here are the same kind of benchmarks against SSC and Stata Journal commands:
Code:
* Install the gtools package from SSC (the post states Stata 13.1 or later).
ssc install gtools
- gcollapse, 4-60x faster than collapse (2-9x faster than fcollapse)
- gcontract, 2-7x faster than contract
- gegen, 4-26x faster than egen (2-4x faster than fegen)
- gisid, 4-30x faster than isid (4-14x faster than fisid)
- glevelsof, 2-13x faster than levelsof (1.5-13x faster than flevelsof)
- gquantiles, 10-30x faster than xtile (2.5-30x faster than fastxtile)
- gquantiles, 3-40x faster than pctile and _pctile
- gquantiles with by, 3-12x faster than astile with by (next-fastest I found)
- gunique and gdistinct, 4-26x faster than their SSC and Stata Journal counterparts (unique and distinct).
- gduplicates, 5-15x faster than duplicates.
The syntax of each command is largely analogous to that of its native counterpart, and several additions are available. See "gtools, examples" for some quick examples and "help gtools" for details and additional features. The package's official website, which includes more detailed examples, benchmarks, and documentation, is https://gtools.readthedocs.io; development currently takes place on GitHub (https://github.com/mcaceresb/stata-gtools).
Feedback and comments are welcome! Here are some quick benchmarks to showcase the speed gains (Stata 15/MP):
Code:
* bench: run one command under a Stata timer and hand the elapsed time back
* to the caller.
*
* Usage:    bench #: command
*   #         timer number to use for this run
*   command   the full command line to be timed, run exactly as typed
*
* Result:   a local macro named r# (for example r11 after "bench 11: ...")
*           is created in the CALLER's scope and holds the elapsed seconds
*           that timer list reports for timer #.
*
* Dropping any earlier definition first lets the whole snippet be re-run in
* the same session; without it, a second run stops with
* "bench already defined" (r(110)).
capture program drop bench
program bench
    * Everything before the first colon is the timer number ...
    gettoken timer call: 0, p(:)
    * ... then strip the colon itself; what is left is the command to time.
    gettoken colon call: call, p(:)

    * Reset this timer so the result reflects this run only. capture keeps
    * going should the reset fail for any reason.
    cap timer clear `timer'

    timer on `timer'
    `call'
    timer off `timer'

    * timer list stores the elapsed time in r(t#); run it quietly so the
    * timer report is not printed.
    qui timer list

    * c_local defines the macro in the caller rather than inside this
    * program, which is how the result outlives the call.
    c_local r`timer' `=r(t`timer')'
end
* Benchmark gtools commands against their native Stata counterparts.
* Relies on the bench program being defined beforehand.
*
* Simulated data: 10 million observations.
*   groups  integer group id, int(runiform() * 1000)
*   rsort   standard normal draw, used only as the sort key
*   rvar    standard normal draw, the variable being summarised
*   ix      observation number at creation time (unique per row)
clear
set obs 10000000
gen groups = int(runiform() * 1000)
gen rsort = rnormal()
gen rvar = rnormal()
gen ix = _n
* Sorting on a random key leaves the data in random order with respect to
* groups and ix.
sort rsort
timer clear
* Timer numbering: for each pair, the timer ending in 1 times the gtools
* command and the timer ending in 0 times the native command.
*
* collapse replaces the data in memory, so each run starts from the same
* preserved copy: "restore, preserve" brings the data back and keeps the
* copy available for the final restore.
preserve
bench 11: gcollapse (sum) rvar (mean) mean = rvar, by(groups)
restore, preserve
bench 10: collapse (sum) rvar (mean) mean = rvar, by(groups)
restore
* contract also replaces the data in memory; same preserve/restore pattern.
preserve
bench 21: gcontract groups
restore, preserve
bench 20: contract groups
restore
* The commands below only add variables or return results, so no
* preserve/restore is needed. Distinct output names (g_id vs id, etc.) keep
* the two runs from colliding.
bench 31: gegen g_id = group(groups)
bench 30: egen id = group(groups)
bench 41: gisid ix
bench 40: isid ix
bench 51: qui glevelsof groups
bench 50: qui levelsof groups
bench 61: gquantiles g_xtile = rvar, nq(10) xtile
bench 60: xtile xtile = rvar, nq(10)
bench 71: gquantiles g_pctile = rvar, nq(10) pctile
bench 70: pctile pctile = rvar, nq(10)
* duplicates drop removes observations, so the data are preserved again.
preserve
bench 81: gduplicates drop groups, force
restore, preserve
bench 80: duplicates drop groups, force
restore
* Build and print a results table from the timings stored in the locals
* r10, r11, r20, r21, ... r80, r81.
*
* The names in commands must be listed in the same order as the timer
* pairs 10, 20, ..., 80, because the loop takes one name per iteration.
local commands collapse contract egen isid levelsof xtile pctile duplicates
* bench_table accumulates a list of display arguments: quoted row strings
* separated by _n(1) newline directives. It starts with the header and the
* separator row.
local bench_table `" Versus | Native | gtools | % faster "'
local bench_table `"`bench_table'"' _n(1) `" ---------- | ------ | ------ | -------- "'
forvalues i = 10(10)80 {
* Take the next command name off the front of the list.
gettoken cmd commands: commands
* Timer i is the native run and timer i + 1 the gtools run.
* pct = 100 * (native - gtools) / native, the percentage reduction in
* run time relative to the native command.
local pct "`:disp %7.2f 100 * (`r`i'' - `r`=`i'+1'') / `r`i'''"
local dnative "`:disp %6.2f `r`i'''"
local dgtools "`:disp %6.2f `r`=`i'+1'''"
* Format the command name with %10s so the first column lines up.
local cmd `"`:disp %10s "`cmd'"'"'
local bench_table `"`bench_table'"' _n(1) `" `cmd' | `dnative' | `dgtools' | `pct'% "'
}
* Expanding bench_table hands display the rows and newline directives.
disp _n(1) `"`bench_table'"'
Code:
Versus | Native | gtools | % faster
---------- | ------ | ------ | --------
collapse | 8.80 | 2.84 | 67.73%
contract | 9.61 | 1.48 | 84.56%
egen | 13.18 | 1.66 | 87.42%
isid | 33.18 | 2.44 | 92.65%
levelsof | 3.12 | 0.87 | 72.04%
xtile | 31.13 | 1.40 | 95.52%
pctile | 7.85 | 1.43 | 81.77%
duplicates | 36.95 | 1.91 | 94.83%
Code:
* Benchmark gtools against user-written commands from SSC and the Stata
* Journal. Relies on the bench program being defined beforehand and on the
* variables groups, rvar and ix being in memory.
*
* Install the packages that supply the comparison commands used below.
ssc install ftools
ssc install fastxtile
ssc install distinct
ssc install unique
* Remove these variables so that the gegen, fegen, gquantiles and fastxtile
* calls below can create them again. drop expects all four to exist.
drop g_id id g_xtile xtile
timer clear
* Timer numbering: for each pair, the timer ending in 1 times the gtools
* command and the timer ending in 0 times the comparison command.
*
* Both collapse variants replace the data in memory, so each run starts
* from the same preserved copy.
preserve
bench 11: gcollapse (sum) rvar (mean) mean = rvar, by(groups)
restore, preserve
bench 10: fcollapse (sum) rvar (mean) mean = rvar, by(groups)
restore
bench 21: gegen g_id = group(groups)
bench 20: fegen id = group(groups)
bench 31: gisid ix
bench 30: fisid ix
bench 41: qui glevelsof groups
bench 40: qui flevelsof groups
bench 51: gquantiles g_xtile = rvar, nq(10) xtile
bench 50: fastxtile xtile = rvar, nq(10)
bench 61: qui gdistinct groups
bench 60: qui distinct groups
bench 71: qui gunique groups
bench 70: qui unique groups
* Build and print a results table from the timings stored in the locals
* r10, r11, r20, r21, ... r70, r71.
*
* The names in commands must be listed in the same order as the timer
* pairs 10, 20, ..., 70, because the loop takes one name per iteration.
local commands fcollapse fegen fisid flevelsof fastxtile distinct unique
* bench_table accumulates a list of display arguments: quoted row strings
* separated by _n(1) newline directives. The column headed Native holds the
* time of the comparison command named in the first column.
local bench_table `" Versus | Native | gtools | % faster "'
local bench_table `"`bench_table'"' _n(1) `" ---------- | ------ | ------ | -------- "'
forvalues i = 10(10)70 {
* Take the next command name off the front of the list.
gettoken cmd commands: commands
* Timer i is the comparison run and timer i + 1 the gtools run.
* pct = 100 * (comparison - gtools) / comparison, the percentage reduction
* in run time relative to the comparison command.
local pct "`:disp %7.2f 100 * (`r`i'' - `r`=`i'+1'') / `r`i'''"
local dnative "`:disp %6.2f `r`i'''"
local dgtools "`:disp %6.2f `r`=`i'+1'''"
* Format the command name with %10s so the first column lines up.
local cmd `"`:disp %10s "`cmd'"'"'
local bench_table `"`bench_table'"' _n(1) `" `cmd' | `dnative' | `dgtools' | `pct'% "'
}
* Expanding bench_table hands display the rows and newline directives.
disp _n(1) `"`bench_table'"'
Code:
Versus | Native | gtools | % faster
---------- | ------ | ------ | --------
fcollapse | 7.90 | 3.09 | 60.89%
fegen | 2.13 | 1.67 | 21.87%
fisid | 6.21 | 2.37 | 61.84%
flevelsof | 1.15 | 0.85 | 25.94%
fastxtile | 6.90 | 1.72 | 75.04%
distinct | 12.34 | 0.87 | 92.98%
unique | 9.73 | 0.85 | 91.24%

Comment