SNAP Library 2.2, User Reference
2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 00002 // Community detection algorithms 00003 namespace TSnap { 00004 00005 00006 namespace TSnapDetail { 00007 // GIRVAN-NEWMAN algorithm 00008 // 1. The betweenness of all existing edges in the network is calculated first. 00009 // 2. The edge with the highest betweenness is removed. 00010 // 3. The betweenness of all edges affected by the removal is recalculated. 00011 // 4. Steps 2 and 3 are repeated until no edges remain. 00012 // Girvan M. and Newman M. E. J., Community structure in social and biological networks, Proc. Natl. Acad. Sci. USA 99, 7821-7826 (2002) 00013 // Keep removing edges from Graph until one of the connected components of Graph splits into two. 00014 void CmtyGirvanNewmanStep(PUNGraph& Graph, TIntV& Cmty1, TIntV& Cmty2) { 00015 TIntPrFltH BtwEH; 00016 TBreathFS<PUNGraph> BFS(Graph); 00017 Cmty1.Clr(false); Cmty2.Clr(false); 00018 while (true) { 00019 TSnap::GetBetweennessCentr(Graph, BtwEH); 00020 BtwEH.SortByDat(false); 00021 if (BtwEH.Empty()) { return; } 00022 const int NId1 = BtwEH.GetKey(0).Val1; 00023 const int NId2 = BtwEH.GetKey(0).Val2; 00024 Graph->DelEdge(NId1, NId2); 00025 BFS.DoBfs(NId1, true, false, NId2, TInt::Mx); 00026 if (BFS.GetHops(NId1, NId2) == -1) { // two components 00027 TSnap::GetNodeWcc(Graph, NId1, Cmty1); 00028 TSnap::GetNodeWcc(Graph, NId2, Cmty2); 00029 return; 00030 } 00031 } 00032 } 00033 00034 // Connected components of a graph define clusters 00035 // OutDegH and OrigEdges stores node degrees and number of edges in the original graph 00036 double _GirvanNewmanGetModularity(const PUNGraph& G, const TIntH& OutDegH, const int& OrigEdges, TCnComV& CnComV) { 00037 TSnap::GetWccs(G, CnComV); // get communities 00038 double Mod = 0; 00039 for (int c = 0; c < CnComV.Len(); c++) { 00040 const TIntV& NIdV = CnComV[c](); 00041 double EIn=0, EEIn=0; 00042 for (int i = 0; i < NIdV.Len(); i++) { 00043 TUNGraph::TNodeI NI = G->GetNI(NIdV[i]); 00044 EIn += NI.GetOutDeg(); 00045 EEIn += OutDegH.GetDat(NIdV[i]); 00046 } 00047 Mod += (EIn-EEIn*EEIn/(2.0*OrigEdges)); 00048 } 00049 if (Mod == 0) { return 0; } 00050 else { return Mod/(2.0*OrigEdges); } 00051 } 00052 00053 TIntFltH MapEquationNew2Modules(PUNGraph& Graph, TIntH& Module, TIntFltH& Qi, int a, int b){ 00054 TIntFltH Qi1; 00055 Qi1 = Qi; 00056 float InModule=0.0, OutModule=0.0, Val; 00057 int Mds[2] = {a,b}; 00058 for (int i=0; i<2; i++) { 00059 InModule=0.0, OutModule=0.0; 00060 if (Qi1.IsKey(Mds[i])){ 00061 int CentralModule = Mds[i]; 00062 for (TUNGraph::TEdgeI EI = Graph->BegEI(); EI < Graph->EndEI(); EI++) { 00063 if (Module.GetDat(EI.GetSrcNId()) == Module.GetDat(EI.GetDstNId()) && Module.GetDat(EI.GetDstNId()) == CentralModule) { 00064 InModule += 1.0; 00065 } else if ((Module.GetDat(EI.GetSrcNId()) == CentralModule && Module.GetDat(EI.GetDstNId()) != CentralModule) || (Module.GetDat(EI.GetSrcNId()) != CentralModule && Module.GetDat(EI.GetDstNId()) == CentralModule)) { 00066 OutModule +=1.0; 00067 } 00068 } 00069 Val = 0.0; 00070 if (InModule+OutModule > 0) { 00071 Val = OutModule/(InModule+OutModule); 00072 } 00073 Qi1.DelKey(Mds[i]); 00074 Qi1.AddDat(Mds[i],Val); 00075 } else { 00076 Qi1.DelKey(Mds[i]); 00077 Qi1.AddDat(Mds[i],0.0); 00078 } 00079 } 00080 00081 return Qi1; 00082 } 00083 00084 float Equation(PUNGraph& Graph, TIntFltH& PAlpha,float& SumPAlphaLogPAlpha, TIntFltH& Qi){ 00085 float SumPAlpha = 1.0, SumQi = 0.0, SumQiLogQi=0.0, SumQiSumPAlphaLogQiSumPAlpha = 0.0; 00086 for (int i=0; i<Qi.Len(); i++) { 00087 SumQi += Qi[i]; 00088 SumQiLogQi += Qi[i]*log(Qi[i]); 00089 SumQiSumPAlphaLogQiSumPAlpha += (Qi[i]+SumPAlpha)*log(Qi[i]+SumPAlpha); 00090 } 00091 return (SumQi*log(SumQi)-2*SumQiLogQi-SumPAlphaLogPAlpha+SumQiSumPAlphaLogQiSumPAlpha); 00092 } 00093 00094 } // namespace TSnapDetail 00095 00096 // Maximum modularity clustering by Girvan-Newman algorithm (slow) 00097 // Girvan M. and Newman M. E. J., Community structure in social and biological networks, Proc. Natl. Acad. Sci. USA 99, 7821-7826 (2002) 00098 double CommunityGirvanNewman(PUNGraph& Graph, TCnComV& CmtyV) { 00099 TIntH OutDegH; 00100 const int NEdges = Graph->GetEdges(); 00101 for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) { 00102 OutDegH.AddDat(NI.GetId(), NI.GetOutDeg()); 00103 } 00104 double BestQ = -1; // modularity 00105 TCnComV CurCmtyV; 00106 CmtyV.Clr(); 00107 TIntV Cmty1, Cmty2; 00108 while (true) { 00109 TSnapDetail::CmtyGirvanNewmanStep(Graph, Cmty1, Cmty2); 00110 const double Q = TSnapDetail::_GirvanNewmanGetModularity(Graph, OutDegH, NEdges, CurCmtyV); 00111 //printf("current modularity: %f\n", Q); 00112 if (Q > BestQ) { 00113 BestQ = Q; 00114 CmtyV.Swap(CurCmtyV); 00115 } 00116 if (Cmty1.Len()==0 || Cmty2.Len() == 0) { break; } 00117 } 00118 return BestQ; 00119 } 00120 00121 // Rosvall-Bergstrom community detection algorithm based on information theoretic approach. 00122 // See: Rosvall M., Bergstrom C. T., Maps of random walks on complex networks reveal community structure, Proc. Natl. Acad. Sci. USA 105, 1118-1123 (2008) 00123 double Infomap(PUNGraph& Graph, TCnComV& CmtyV){ 00124 TIntH DegH; 00125 TIntFltH PAlpha; // probability of visiting node alpha 00126 TIntH Module; // module of each node 00127 TIntFltH Qi; // probability of leaving each module 00128 float SumPAlphaLogPAlpha = 0.0; 00129 int Br = 0; 00130 const int e = Graph->GetEdges(); 00131 00132 // initial values 00133 for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) { 00134 DegH.AddDat(NI.GetId(), NI.GetDeg()); 00135 float d = ((float)NI.GetDeg()/(float)(2*e)); 00136 PAlpha.AddDat(NI.GetId(), d); 00137 SumPAlphaLogPAlpha += d*log(d); 00138 Module.AddDat(NI.GetId(),Br); 00139 Qi.AddDat(Module[Br],1.0); 00140 Br+=1; 00141 } 00142 00143 float MinCodeLength = TSnapDetail::Equation(Graph,PAlpha,SumPAlphaLogPAlpha,Qi); 00144 float NewCodeLength, PrevIterationCodeLength = 0.0; 00145 int OldModule, NewModule; 00146 00147 do { 00148 PrevIterationCodeLength = MinCodeLength; 00149 for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) { 00150 MinCodeLength = TSnapDetail::Equation(Graph, PAlpha, SumPAlphaLogPAlpha, Qi); 00151 for (int i=0; i<DegH.GetDat(NI.GetId()); i++) { 00152 OldModule = Module.GetDat(NI.GetId()); 00153 NewModule = Module.GetDat(NI.GetNbrNId(i)); 00154 if (OldModule!=NewModule){ 00155 Module.DelKey(NI.GetId()); 00156 Module.AddDat(NI.GetId(),NewModule); 00157 Qi = TSnapDetail::MapEquationNew2Modules(Graph,Module,Qi,OldModule, NewModule); 00158 NewCodeLength = TSnapDetail::Equation(Graph,PAlpha,SumPAlphaLogPAlpha, Qi); 00159 if (NewCodeLength<MinCodeLength) { 00160 MinCodeLength = NewCodeLength; 00161 OldModule = NewModule; 00162 } else { 00163 Module.DelKey(NI.GetId()); 00164 Module.AddDat(NI.GetId(),OldModule); 00165 } 00166 } 00167 } 00168 } 00169 } while (MinCodeLength<PrevIterationCodeLength); 00170 00171 Module.SortByDat(true); 00172 int Mod=-1; 00173 for (int i=0; i<Module.Len(); i++) { 00174 if (Module[i]>Mod){ 00175 Mod = Module[i]; 00176 TCnCom t; 00177 for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++){ 00178 if (Module.GetDat(NI.GetId())==Mod) 00179 t.Add(NI.GetId()); 00180 } 00181 CmtyV.Add(t); 00182 } 00183 } 00184 00185 return MinCodeLength; 00186 } 00187 00188 namespace TSnapDetail { 00192 class TCNMQMatrix { 00193 private: 00194 struct TCmtyDat { 00195 double DegFrac; 00196 TIntFltH NIdQH; 00197 int MxQId; 00198 TCmtyDat() : MxQId(-1) { } 00199 TCmtyDat(const double& NodeDegFrac, const int& OutDeg) : 00200 DegFrac(NodeDegFrac), NIdQH(OutDeg), MxQId(-1) { } 00201 void AddQ(const int& NId, const double& Q) { NIdQH.AddDat(NId, Q); 00202 if (MxQId==-1 || NIdQH[MxQId]<Q) { MxQId=NIdQH.GetKeyId(NId); } } 00203 void UpdateMaxQ() { MxQId=-1; 00204 for (int i = -1; NIdQH.FNextKeyId(i); ) { 00205 if (MxQId==-1 || NIdQH[MxQId]< NIdQH[i]) { MxQId=i; } } } 00206 void DelLink(const int& K) { const int NId=GetMxQNId(); 00207 NIdQH.DelKey(K); if (NId==K) { UpdateMaxQ(); } } 00208 int GetMxQNId() const { return NIdQH.GetKey(MxQId); } 00209 double GetMxQ() const { return NIdQH[MxQId]; } 00210 }; 00211 private: 00212 THash<TInt, TCmtyDat> CmtyQH; 00213 THeap<TFltIntIntTr> MxQHeap; 00214 TUnionFind CmtyIdUF; 00215 double Q; 00216 public: 00217 TCNMQMatrix(const PUNGraph& Graph) : CmtyQH(Graph->GetNodes()), 00218 MxQHeap(Graph->GetNodes()), CmtyIdUF(Graph->GetNodes()) { Init(Graph); } 00219 void Init(const PUNGraph& Graph) { 00220 const double M = 0.5/Graph->GetEdges(); // 1/2m 00221 Q = 0.0; 00222 for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) { 00223 CmtyIdUF.Add(NI.GetId()); 00224 const int OutDeg = NI.GetOutDeg(); 00225 if (OutDeg == 0) { continue; } 00226 TCmtyDat& Dat = CmtyQH.AddDat(NI.GetId(), TCmtyDat(M * OutDeg, OutDeg)); 00227 for (int e = 0; e < NI.GetOutDeg(); e++) { 00228 const int DstNId = NI.GetOutNId(e); 00229 const double DstMod = 2 * M * (1.0 - OutDeg * Graph->GetNI(DstNId).GetOutDeg() * M); 00230 Dat.AddQ(DstNId, DstMod); 00231 } 00232 Q += -1.0*TMath::Sqr(OutDeg*M); 00233 if (NI.GetId() < Dat.GetMxQNId()) { 00234 MxQHeap.Add(TFltIntIntTr(Dat.GetMxQ(), NI.GetId(), Dat.GetMxQNId())); } 00235 } 00236 MxQHeap.MakeHeap(); 00237 } 00238 TFltIntIntTr FindMxQEdge() { 00239 while (true) { 00240 if (MxQHeap.Empty()) { break; } 00241 const TFltIntIntTr TopQ = MxQHeap.PopHeap(); 00242 if (! CmtyQH.IsKey(TopQ.Val2) || ! CmtyQH.IsKey(TopQ.Val3)) { continue; } 00243 if (TopQ.Val1!=CmtyQH.GetDat(TopQ.Val2).GetMxQ() && TopQ.Val1!=CmtyQH.GetDat(TopQ.Val3).GetMxQ()) { continue; } 00244 return TopQ; 00245 } 00246 return TFltIntIntTr(-1, -1, -1); 00247 } 00248 bool MergeBestQ() { 00249 const TFltIntIntTr TopQ = FindMxQEdge(); 00250 if (TopQ.Val1 <= 0.0) { return false; } 00251 // joint communities 00252 const int I = TopQ.Val3; 00253 const int J = TopQ.Val2; 00254 CmtyIdUF.Union(I, J); // join 00255 Q += TopQ.Val1; 00256 TCmtyDat& DatJ = CmtyQH.GetDat(J); 00257 { TCmtyDat& DatI = CmtyQH.GetDat(I); 00258 DatI.DelLink(J); DatJ.DelLink(I); 00259 for (int i = -1; DatJ.NIdQH.FNextKeyId(i); ) { 00260 const int K = DatJ.NIdQH.GetKey(i); 00261 TCmtyDat& DatK = CmtyQH.GetDat(K); 00262 double NewQ = DatJ.NIdQH[i]; 00263 if (DatI.NIdQH.IsKey(K)) { NewQ = NewQ+DatI.NIdQH.GetDat(K); DatK.DelLink(I); } // K connected to I and J 00264 else { NewQ = NewQ-2*DatI.DegFrac*DatK.DegFrac; } // K connected to J not I 00265 DatJ.AddQ(K, NewQ); 00266 DatK.AddQ(J, NewQ); 00267 MxQHeap.PushHeap(TFltIntIntTr(NewQ, TMath::Mn(J,K), TMath::Mx(J,K))); 00268 } 00269 for (int i = -1; DatI.NIdQH.FNextKeyId(i); ) { 00270 const int K = DatI.NIdQH.GetKey(i); 00271 if (! DatJ.NIdQH.IsKey(K)) { // K connected to I not J 00272 TCmtyDat& DatK = CmtyQH.GetDat(K); 00273 const double NewQ = DatI.NIdQH[i]-2*DatJ.DegFrac*DatK.DegFrac; 00274 DatJ.AddQ(K, NewQ); 00275 DatK.DelLink(I); 00276 DatK.AddQ(J, NewQ); 00277 MxQHeap.PushHeap(TFltIntIntTr(NewQ, TMath::Mn(J,K), TMath::Mx(J,K))); 00278 } 00279 } 00280 DatJ.DegFrac += DatI.DegFrac; } 00281 if (DatJ.NIdQH.Empty()) { CmtyQH.DelKey(J); } // isolated community (done) 00282 CmtyQH.DelKey(I); 00283 return true; 00284 } 00285 static double CmtyCMN(const PUNGraph& Graph, TCnComV& CmtyV) { 00286 TCNMQMatrix QMatrix(Graph); 00287 // maximize modularity 00288 while (QMatrix.MergeBestQ()) { } 00289 // reconstruct communities 00290 THash<TInt, TIntV> IdCmtyH; 00291 for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) { 00292 IdCmtyH.AddDat(QMatrix.CmtyIdUF.Find(NI.GetId())).Add(NI.GetId()); 00293 } 00294 CmtyV.Gen(IdCmtyH.Len()); 00295 for (int j = 0; j < IdCmtyH.Len(); j++) { 00296 CmtyV[j].NIdV.Swap(IdCmtyH[j]); 00297 } 00298 return QMatrix.Q; 00299 } 00300 }; 00301 00302 } // namespace TSnapDetail 00303 00304 double CommunityCNM(const PUNGraph& Graph, TCnComV& CmtyV) { 00305 return TSnapDetail::TCNMQMatrix::CmtyCMN(Graph, CmtyV); 00306 } 00307 00308 }; //namespace TSnap