I agree with the last comment of Oded Mcdossi....... that is good
-
Login or Register
- Log in with
capture program drop coo_prx
program coo_prx
syntax anything, sum(integer) Measure(string) // coo_prx matrixname, sum(number_of_cases) m(measure_type)
loc COO `anything' // co-occurance matrix
parse_dissim `measure'
loc measure `s(dist)'
if "`s(binary)'"=="" {
di as error "only binary measures accepted"
exit
}
qui mat list `COO' // assert matrix exists
if issymmetric(`COO')!=1 {
di as error "matrix `C' not symmetric"
exit
}
mata: dissmat("`COO'", `sum', "`measure'")
mat coln PRX=`:colnames `COO''
mat rown PRX=`:rownames `COO''
mat list PRX, f(%8.4f) // proximity matrix stored in PRX
end
mata:
function dissmat(string scalar COO, real scalar sum, string scalar measure)
{
measure=st_local("measure")
COO=st_matrix(COO)
PRX=J(cols(COO),rows(COO),.)
for (i=1;i<=rows(COO);i++) {
for (j=1;j<=rows(COO);j++) {
if (i!=j) {
a=COO[i,j]
b=COO[j,j]; b=b-a
c=COO[i,i]; c=c-a
d=sum-(a+b+c)
if (measure=="matching") PRX[i,j]=(a+d)/(a+b+c+d)
else if (measure=="Jaccard") PRX[i,j]=a/(a+b+c)
else if (measure=="Russell") PRX[i,j]=a/(a+b+c+d)
else if (measure=="Hamann") PRX[i,j]=((a+d)-(b+c))/(a+b+c+d)
else if (measure=="Dice") PRX[i,j]=(2*a)/(2*a+b+c)
else if (measure=="antiDice") PRX[i,j]=a/(a+2*(b+c))
else if (measure=="Sneath") PRX[i,j]=(2*(a+d))/(2*(a+d)+(b+c))
else if (measure=="Rogers") PRX[i,j]=(a+d)/((a+d)+2*(b+c))
else if (measure=="Ochiai") PRX[i,j]=a/sqrt((a+b)*(a+c))
else if (measure=="Yule") PRX[i,j]=(a*d-b*c)/(a*d+b*c)
else if (measure=="Anderberg") PRX[i,j]=(a/(a+b) + a/(a+c) + d/(c+d) + d/(b+d))/4
else if (measure=="Kulczynski") PRX[i,j]=(a/(a+b) + a/(a+c))/2
else if (measure=="Pearson") PRX[i,j]=(a*d-b*c)/sqrt((a+b)*(a+c)*(d+b)*(d+c))
else if (measure=="Gower2") PRX[i,j]=a*d/sqrt((a+b)*(a+c)*(d+b)*(d+c))
else _error(3888)
}
else {
PRX[i,j]=0
}
}
}
st_matrix("PRX",PRX)
}
end
clear
mat drop _all
// Some random data
set obs 10000
loc n=70
forval i=1/`n' {
gen v`i'=floor(2*runiform())
}
// produce co-occurrence matrix
mat C=J(`n',`n',.)
loc nlist ""
forval i=1/`n' {
loc nlist="`nlist' v`i'"
}
mat coln C=`nlist'
mat rown C=`nlist'
forval i=1/`n'{
forval j=1/`n' {
if (`i'!=`j') {
qui count if (v`i'==v`j' & v`i'==1)
mat C[`i',`j']=`r(N)'
}
else {
qui count if v`i'==1
mat C[`i',`i']=`r(N)'
}
}
}
// run the program
coo_prx C, sum(1000) m(match) // sum() takes any positive integers, and m() takes any of the binary similarity measures
capture program drop todistance
program define todistance, rclass // to convert a coocurrence matrix into distances
if "A`2'"=="A" {
local 2 trace(`1')
}
matrix J=diag(J(1,`=rowsof(`1')',1))
matrix K=diag(J(1,`=rowsof(`1')',1))
forvalues X=2/`=rowsof(`1')' {
forvalues Y=1/`=`X'-1' {
matrix O`X'_`Y'=J(2,2,.)
matrix O`X'_`Y'=J(2,2,.)
matrix O`X'_`Y'[1,1]=`1'[`X',`Y']
matrix O`X'_`Y'[1,2]=`1'[`X',`X']-`1'[`X',`Y']
matrix O`X'_`Y'[2,2]=`2'-`1'[`X',`X']-`1'[`Y',`Y']+`1'[`X',`Y']
matrix O`X'_`Y'[2,1]=`1'[`Y',`Y']-`1'[`X',`Y']
matrix rownames O`X'_`Y'=Paper`X'(Si) Paper`X'(No)
matrix colnames O`X'_`Y'=Paper`Y'(Si) Paper`Y'(No)
// matlist O`X'_`Y'
matrix J[`X',`Y']=O`X'_`Y'[1,1]/(O`X'_`Y'[1,1]+O`X'_`Y'[1,2]+O`X'_`Y'[2,1])
matrix J[`Y', `X']=J[`X',`Y']
matrix K[`X',`Y']=((O`X'_`Y'[1,1]/(O`X'_`Y'[1,1]+O`X'_`Y'[1,2]))+(O`X'_`Y'[1,1]/(O`X'_`Y'[1,1]+O`X'_`Y'[2,1])))/2
matrix K[`Y', `X']=K[`X',`Y']
return matrix O`X'_`Y'=O`X'_`Y'
}
}
matrix colnames J=`:colnames(`1')'
matrix rownames J=`:rownames(`1')'
matrix colnames K=`:colnames(`1')'
matrix rownames K=`:rownames(`1')'
matlist J, title("Jaccard distances") format(%3.2f)
matlist K, title("Kulczynski distances")format(%3.2f)
end
// Input the coocurrence matrix with input (or with the data editor).
clear
input p1 p2 p3 p4
60 10 20 25
10 50 30 15
20 30 40 12
25 15 12 30
end
// Convert your data file into a matrix (A)
mkmat p1-p4, mat(A)
matrix rownames A=`:colnames(A)'
//Convert your coocurrence matrix (similarity matrix, but without diagonal of 1's
//into a distance (dissimilarity) matrix
//(Note that in this case you obtain Jaccard -J- and Kulczynski -K-)
todistance A //you may add a number if the total is diferent to the sum of the diagonal
// Once you have J (Jaccard) and K (Kulczynski), you can apply MDS to these distances matrices.
mdsmat J, s2d(standard) // you can add other options as convenient (see help mdsmat)
mdsconfig, name(Jacard, replace)
mdsmat K, s2d(standard)
mdsconfig, name(Kulczynski, replace)
mat A=(.,20,20,10\20,.,15,10\20,15,.,25\10,10,25,.) // fake co-occurrence matrix with missing along the diagonal
mat list A // display matrix A
mata: st_matrix("B",rowsum(st_matrix("A"))) // returns the row sums in 4x1 matrix named B; see *1* for Stata equivalent
// this loop fills the diagonal with row sums
forval i=1/`=colsof(A)' {
mat A[`i',`i']=B[`i',1]
}
// calculate the grand total from the diagonal (could just use matrix B, but let's pretend we don't have B); see *2* for Stata equivalent
mat C=vecdiag(A) // extracts the diagonal in a 1x4 matrix named C
mata: st_local("ttl",strofreal(rowsum(st_matrix("C"))/2)) // calculate grand total and store in Stata local `ttl'
mat list A
/*------------
symmetric A[4,4]
c1 c2 c3 c4
r1 50
r2 20 45
r3 20 15 60
r4 10 10 25 45
----------------*/
/*----------------
v1
1 0
v2 1 a b
0 c d
----------------*/
// calcuate a-d based on A[2,1]
loc a=A[2,1] // 20
loc b=A[2,2]-`a' // 25 (A[2,2] from the diagonal)
loc c=A[1,1]-`a' // 30 (A[1,1] from the diagonal)
loc d=`ttl'-(`a'+`b'+`c') // 25 (`ttl' from the grand total)
di "| a=`a' | b=`b' | c=`c' | d=`d' |"
//----Stata equivalent of mata lines----//
*1*
/*---
mat B=J(`=colsof(A)',1,0)
forval i=1/`=colsof(A)' {
forval j=1/`=colsof(A)' {
if `i'!=`j' {
mat B[`i',1]=B[`i',1]+A[`i',`j']
}
}
}
----*/
*2*
/*----
loc ttl=0
forval i=1/`=colsof(C)' {
loc ttl=`ttl'+A[`i',`i']
}
loc ttl=`ttl'/2
----*/
| PaperA | PaperB | PaperC | PaperD | |
| Paper1 | 0 | 0 | 0 | 0 |
| Paper2 | 1 | 0 | 0 | 0 |
| Paper3 | 1 | 1 | 0 | 0 |
| Paper4 | 1 | 1 | 1 | 0 |
| Paper5 | 1 | 1 | 1 | 1 |
| Paper1 | Paper2 | Paper3 | Paper4 | Paper5 | |
| Paper1 | 0 | 0 | 0 | 0 | 0 |
| Paper2 | 0 | 1 | 1 | 1 | 1 |
| Paper3 | 0 | 1 | 2 | 2 | 2 |
| Paper4 | 0 | 1 | 2 | 3 | 3 |
| Paper5 | 0 | 1 | 2 | 3 | 4 |
net install coin, from(http://sociocav.usal.es/stata/)
. clear
. net install coin, from(http://sociocav.usal.es/stata) replace
. input p1 p2 p3 p4 co
1. 1 1 0 0 10
2. 1 0 1 0 20
3. 1 0 0 1 25
4. 0 1 1 0 30
5. 0 1 0 1 15
6. 0 0 1 1 12
7. end
. coin p1-p4 [fweight=co], f plot(pca) p(.65)
112 scenarios. 2 p<=.65 coincidences amongst 4 events. Density: 0.33
4 events(n>=5): p1 p2 p3 p4
Frequencies | p1 p2 p3 p4
---------------------+----------------------------
p1 | 55
p2 | 10 55
p3 | 20 30 62
p4 | 25 15 12 52
Comment