I tried to find a Stata module that counts the unique letters in a word, but couldn't find one. For example, the word "alberta" contains the letters a, b, e, l, r, and t, so the new variable should display 6.
-
Login or Register
- Log in with
* Example data (originally generated by -dataex-; type help dataex for details)
clear
input str20 province
"alberta"
"new brunswick"
"prince edward island"
end

* Byte length of each name; the longest name fixes how many positions to scan.
generate length = strlen(province)
summarize length
local maxlen = r(max)

* Spread every name over one single-character variable per position
* (char1, char2, ...). Names shorter than the current position are left
* empty there.
quietly {
    forvalues pos = 1/`maxlen' {
        generate char`pos' = substr(province, `pos', 1) if length >= `pos'
    }
}

* Count the distinct non-empty values across the char variables of each row.
* NOTE(review): rowsvals() is not an official egen function; presumably it
* comes from the egenmore package on SSC -- confirm it is installed.
* A blank is stored as a character like any other, so multi-word names
* count the space as one extra distinct value.
egen wanted = rowsvals(char*)
list province wanted
     +-------------------------------+
     |             province   wanted |
     |-------------------------------|
  1. |              alberta        6 |
  2. |        new brunswick       11 |
  3. | prince edward island       12 |
     +-------------------------------+
* Remove the single-character helper variables (all variables whose names start with "char").
drop char*
* Count the distinct letters in province, ignoring case.
* Every code point from 65 to 255 is tried in turn; for each one that is a
* letter, nchars goes up by 1 in the observations whose lowercased name
* contains it. Because the name is lowercased first, uppercase code points
* never match, so no letter is counted twice.
gen byte nchars = 0
forvalues codepoint = 65/255 { // parts of unicode blocks "Basic Latin" and "Latin-1 Supplement"
    if ( uisletter(uchar(`codepoint')) ) {
        * ustrpos() returns 0 (false) when the character does not occur
        replace nchars = nchars + 1 if ustrpos(ustrlower(province), uchar(`codepoint') )
    }
}
* Unicode-aware, case-insensitive extraction of one character per position:
* takes the j-th Unicode character (not byte) of the lowercased name.
* This line is a loop body: the macro j must be defined by an enclosing loop.
gen char`j' = usubstr(ustrlower(province), `j', 1) if usubstr(ustrlower(province), `j', 1) != ""
* Start from an empty session and a cleared Results window.
clear *
cls

* Test names: mixed case, accented letters, an apostrophe, a hyphen, blanks.
input str30 input
"Ontario"
"Alberta"
"New Brunswick"
"Prince Edward Island"
"München"
"Malmö"
"L'Aquila"
"Emiglia-Romagna"
end

* Keep letters only: \P{L} matches any character that is not a Unicode letter.
generate textonly = ustrregexra(input, "\P{L}", "", 1)
replace textonly = ustrlower(textonly) // <-- comment line if you care about capitalization
generate textlen = ustrlen(textonly)

* Working variables: letters collected so far, the letter being handled in
* the current pass, and the part of the text still to be processed.
generate unique_letters = ""
generate next_letter = ""
generate remaining = textonly

* One pass per character of the longest cleaned name is always enough,
* because each pass removes at least one character from every non-empty
* remainder.
summarize textlen, meanonly
local npasses = r(max)
quietly {
    forvalues pass = 1/`npasses' {
        * Take the first remaining letter, record it, then delete every
        * occurrence of it from the remainder.
        replace next_letter = usubstr(remaining, 1, 1)
        replace unique_letters = unique_letters + next_letter
        replace remaining = ustrregexra(remaining, next_letter, "", 0)
    }
}

* Discard the scratch variables and report the number of distinct letters.
drop textonly next_letter remaining
generate n_unique = ustrlen(unique_letters)
list, sep(0) abbrev(20)
     +------------------------------------------------------------+
     |                input   textlen   unique_letters   n_unique |
     |------------------------------------------------------------|
  1. |              Ontario         7           ontari          6 |
  2. |              Alberta         7           albert          6 |
  3. |        New Brunswick        12       newbrusick         10 |
  4. | Prince Edward Island        18      princedwasl         11 |
  5. |              München         7           münche          6 |
  6. |                Malmö         5             malö          4 |
  7. |             L'Aquila         7            laqui          5 |
  8. |      Emiglia-Romagna        14        emiglaron          9 |
     +------------------------------------------------------------+
Comment